From bb2f9c1a4484f38dcd9693abecbbb8408db9f944 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7oise=20Conil?= <francoise.conil@insa-lyon.fr>
Date: Tue, 30 Jan 2024 14:35:54 +0100
Subject: [PATCH] Light refactoring

Extract the DuckDB and SQLite queries from the code.
---
 create-table-backend.sql                      | 20 ++++
 extract-all-projects-versions.sql             | 24 +++++
 extract-latest-project-version.sql            | 36 +++++++
 pyproject-latest-to-csv.py                    | 28 +-----
 ...ct-sqlite-get-files-and-extract-backend.py | 96 ++++++++++---------
 query-projects.sql                            | 18 ++++
 6 files changed, 153 insertions(+), 69 deletions(-)
 create mode 100644 create-table-backend.sql
 create mode 100644 extract-all-projects-versions.sql
 create mode 100644 extract-latest-project-version.sql
 create mode 100644 query-projects.sql

diff --git a/create-table-backend.sql b/create-table-backend.sql
new file mode 100644
index 0000000..583cafd
--- /dev/null
+++ b/create-table-backend.sql
@@ -0,0 +1,20 @@
+-- Define the dialect
+-- sqlfluff:dialect:sqlite
+
+-- Set a smaller indent for this file
+-- sqlfluff:indentation:tab_space_size:2
+
+-- Set keywords to be capitalised
+-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper
+
+CREATE TABLE IF NOT EXISTS backends
+(
+  repository TEXT,
+  project_name TEXT,
+  project_version TEXT,
+  backend TEXT,
+  nb_uploads INTEGER,
+  uploaded_on TEXT,
+  year INTEGER,
+  path TEXT
+);
diff --git a/extract-all-projects-versions.sql b/extract-all-projects-versions.sql
new file mode 100644
index 0000000..f5803ea
--- /dev/null
+++ b/extract-all-projects-versions.sql
@@ -0,0 +1,24 @@
+-- Define the dialect
+-- sqlfluff:dialect:duckdb
+
+-- Set a smaller indent for this file
+-- sqlfluff:indentation:tab_space_size:2
+
+-- Set keywords to be capitalised
+-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper
+
+SELECT
+  project_name,
+  COUNT(project_name) AS nb_uploads,
+  MAX(project_version) AS max_version,
+  LIST(DISTINCT project_version) AS all_versions,
+  MAX(uploaded_on) AS max_uploaded_on,
+  LIST(DISTINCT uploaded_on) AS all_uploaded_on,
+  LIST(DISTINCT repository) AS all_repository,
+  LIST(DISTINCT path) AS all_path
+FROM '*.parquet'
+WHERE
+  (DATE_PART('year', uploaded_on) >= '2018')
+  AND REGEXP_MATCHES(path, 'pyproject.toml$')
+  AND skip_reason = ''
+GROUP BY project_name;
diff --git a/extract-latest-project-version.sql b/extract-latest-project-version.sql
new file mode 100644
index 0000000..16945e8
--- /dev/null
+++ b/extract-latest-project-version.sql
@@ -0,0 +1,36 @@
+-- Define the dialect
+-- sqlfluff:dialect:duckdb
+
+-- Set a smaller indent for this file
+-- sqlfluff:indentation:tab_space_size:2
+
+-- Set keywords to be capitalised
+-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper
+
+WITH lpv AS (
+  SELECT
+    project_name,
+    COUNT(project_name) AS nb_uploads,
+    MAX(uploaded_on) AS max_uploaded_on,
+    LIST(DISTINCT uploaded_on) AS all_uploaded_on
+  FROM '*.parquet'
+  WHERE
+    (DATE_PART('year', uploaded_on) >= '2018')
+    AND REGEXP_MATCHES(path, 'pyproject.toml$')
+    AND skip_reason = ''
+  GROUP BY project_name
+)
+
+SELECT
+  ip.repository,
+  ip.project_name,
+  ip.project_version,
+  lpv.nb_uploads,
+  ip.uploaded_on,
+  DATE_PART('year', ip.uploaded_on) AS year,
+  ip.path
+FROM '*.parquet' AS ip
+JOIN
+  lpv
+  ON ip.project_name = lpv.project_name AND ip.uploaded_on = lpv.max_uploaded_on
+WHERE REGEXP_MATCHES(path, 'pyproject.toml$') AND skip_reason = '';
diff --git a/pyproject-latest-to-csv.py b/pyproject-latest-to-csv.py
index 7ebca37..428eb33 100644
--- a/pyproject-latest-to-csv.py
+++ b/pyproject-latest-to-csv.py
@@ -9,33 +9,15 @@ https://duckdb.org/docs/guides/python/execute_sql
 
 import duckdb
 
-ALL_VERSIONS_QUERY = """SELECT project_name, COUNT(project_name) AS nb_uploads,
-  MAX(project_version) AS max_version, 
-  LIST(DISTINCT project_version) AS all_versions,
-  MAX(uploaded_on) AS max_uploaded_on, 
-  LIST(DISTINCT uploaded_on) AS all_uploaded_on,
-  LIST(DISTINCT repository) AS all_repository,
-  LIST(DISTINCT path) AS all_path
-  FROM '*.parquet'
-  WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject.toml$') AND skip_reason == ''
-  GROUP BY project_name;
-"""
+with open('extract-all-projects-versions.sql', 'r') as f:
+    ALL_VERSIONS_QUERY = f.read()
 
 res = duckdb.sql(ALL_VERSIONS_QUERY)
+
 res.to_csv("extract-pyproject-all-versions.csv", header=True)
 
-LATEST_QUERY = """WITH lpv AS (SELECT project_name, COUNT(project_name) AS nb_uploads,
-  MAX(uploaded_on) AS max_uploaded_on, 
-  LIST(DISTINCT uploaded_on) AS all_uploaded_on
-  FROM '*.parquet'
-  WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject.toml$') AND skip_reason == ''
-  GROUP BY project_name)
-SELECT ip.repository, ip.project_name, ip.project_version, lpv.nb_uploads, 
-  ip.uploaded_on, date_part('year', ip.uploaded_on) AS year, ip.path
-  FROM '*.parquet' as ip
-    JOIN lpv ON ip.project_name=lpv.project_name AND ip.uploaded_on=lpv.max_uploaded_on
-  WHERE regexp_matches(path, 'pyproject.toml$') AND skip_reason == '';
-"""
+with open('extract-latest-project-version.sql', 'r') as f:
+    LATEST_QUERY = f.read()
 
 # res = duckdb.sql(LATEST_QUERY).show()
 
diff --git a/pyproject-sqlite-get-files-and-extract-backend.py b/pyproject-sqlite-get-files-and-extract-backend.py
index 6d31a15..48c12b8 100644
--- a/pyproject-sqlite-get-files-and-extract-backend.py
+++ b/pyproject-sqlite-get-files-and-extract-backend.py
@@ -42,35 +42,17 @@ import pycodeorg
 
 LOG = logging.getLogger(__name__)
 
-CREATE_BACKEND = """CREATE TABLE IF NOT EXISTS backends
-(repository TEXT,
- project_name TEXT,
- project_version TEXT,
- backend TEXT,
- nb_uploads INTEGER,
- uploaded_on TEXT,
- year INTEGER,
- path TEXT
-);
-"""
-
-QUERY = """SELECT repository, project_name, project_version, 
-nb_uploads, uploaded_on, year, path 
-FROM pyprojects
-"""
-
-INSERT_BACKEND = """INSERT INTO backends 
-VALUES (:repository, :project_name, :project_version,
-        :backend, :nb_uploads, :uploaded_on, :year, :path)
-"""
 
 if __name__ == "__main__":
     start_time = time.time()
 
-    logging.basicConfig(filename='pyproject-backends.log', level=logging.INFO)
+    logging.basicConfig(filename="pyproject-backends.log", level=logging.INFO)
 
     # Create backend table
     # --------------------
+    with open("create-table-backend.sql", "r") as f:
+        CREATE_BACKEND = f.read()
+
     cnx_backend = sqlite3.connect("pyproject_backends.db")
     cur_backend = cnx_backend.cursor()
 
@@ -78,37 +60,44 @@ if __name__ == "__main__":
 
     # Get project data
     # ----------------
-    cnx_proj = sqlite3.connect('extract-pyproject-latest.db')
+    with open("query-projects.sql", "r") as f:
+        QUERY_PROJECTS = f.read()
+
+    cnx_proj = sqlite3.connect("extract-pyproject-latest.db")
     cnx_proj.row_factory = sqlite3.Row
     cur_proj = cnx_proj.cursor()
 
     cur_proj.execute("SELECT COUNT(*) AS nb FROM pyprojects;")
     r = cur_proj.fetchone()
-    total = r['nb']
+    total = r["nb"]
     cpt = 0
 
-    for row in cur_proj.execute(QUERY):
+    for row in cur_proj.execute(QUERY_PROJECTS):
         values = {
-                "repository": row["repository"],
-                "project_name": row["project_name"],
-                "project_version": row["project_version"],
-                "nb_uploads": row["nb_uploads"],
-                "uploaded_on": row["uploaded_on"],
-                "year": row["year"],
-                "path": row["path"]
-                }
+            "repository": row["repository"],
+            "project_name": row["project_name"],
+            "project_version": row["project_version"],
+            "nb_uploads": row["nb_uploads"],
+            "uploaded_on": row["uploaded_on"],
+            "year": row["year"],
+            "path": row["path"],
+        }
 
         # Only fetch the pyproject.toml at the root of the project
         # --------------------------------------------------------
-        parts = values['path'].split("/")
+        parts = values["path"].split("/")
         if len(parts) == 5 and parts[-1] == "pyproject.toml":
-
             # Fetch the file data from the dataset
             # ------------------------------------
             try:
-                data = pycodeorg.get_data(values['repository'], values['project_name'], values['path'])
+                data = pycodeorg.get_data(
+                    values["repository"], values["project_name"], values["path"]
+                )
             except ValueError as e:
-                LOG.error("pycodeorg.get_data failed to retrieve %s: '%s'" % (values['project_name'], e))
+                LOG.error(
+                    "pycodeorg.get_data failed to retrieve %s: '%s'"
+                    % (values["project_name"], e)
+                )
                 continue
 
             # Then get the 'build-backend' value with a toml library
@@ -116,29 +105,45 @@ if __name__ == "__main__":
             try:
                 toml_dict = tomli.loads(data.decode())
             except (tomli.TOMLDecodeError, UnicodeDecodeError) as e:
-                LOG.error("Error reading TOML file for %s: '%s'" % (values['project_name'], e))
+                LOG.error(
+                    "Error reading TOML file for %s: '%s'" % (values["project_name"], e)
+                )
                 continue
 
             # print(f"{toml_dict=}")
 
-            if toml_dict.get('build-system') and toml_dict['build-system'].get('build-backend'):
-                backend = toml_dict['build-system'].get('build-backend')
+            if toml_dict.get("build-system") and toml_dict["build-system"].get(
+                "build-backend"
+            ):
+                backend = toml_dict["build-system"].get("build-backend")
 
-                values['backend'] = backend
+                values["backend"] = backend
                 print(f"{values['project_name']} : {values['backend']}")
             else:
-                values['backend'] = None
+                values["backend"] = None
                 print(f"{values['project_name']} : .......... no backend found")
 
             try:
-                cur_backend.execute(INSERT_BACKEND, values)
+                cur_backend.execute(
+                    """INSERT INTO backends 
+                              VALUES (:repository, :project_name, :project_version,
+                                      :backend, :nb_uploads, :uploaded_on, :year, :path)
+                    """,
+                    values,
+                )
                 cnx_backend.commit()
             except sqlite3.InterfaceError as e:
-                LOG.error("Error writing to sqlite3 for %s: '%s'" % (values['project_name'], e))
+                LOG.error(
+                    "Error writing to sqlite3 for %s: '%s'"
+                    % (values["project_name"], e)
+                )
                 continue
 
         else:
-            LOG.info(f"%s is not a root path for %s" % (values['path'], values['project_name']))
+            LOG.info(
+                "%s is not a root path for %s"
+                % (values["path"], values["project_name"])
+            )
 
         cpt = cpt + 1
         if cpt % 2500 == 0:
@@ -153,4 +158,3 @@ if __name__ == "__main__":
 
     LOG.info(duration_msg)
     print(duration_msg)
-
diff --git a/query-projects.sql b/query-projects.sql
new file mode 100644
index 0000000..39a726f
--- /dev/null
+++ b/query-projects.sql
@@ -0,0 +1,18 @@
+-- Define the dialect
+-- sqlfluff:dialect:sqlite
+
+-- Set a smaller indent for this file
+-- sqlfluff:indentation:tab_space_size:2
+
+-- Set keywords to be capitalised
+-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper
+
+SELECT
+  repository,
+  project_name,
+  project_version,
+  nb_uploads,
+  uploaded_on,
+  year,
+  path
+FROM pyprojects;
-- 
GitLab