From bb2f9c1a4484f38dcd9693abecbbb8408db9f944 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7oise=20Conil?= <francoise.conil@insa-lyon.fr> Date: Tue, 30 Jan 2024 14:35:54 +0100 Subject: [PATCH] Light refactoring Extract the duckdb and sqlite queries from the code. --- create-table-backend.sql | 20 ++++ extract-all-projects-versions.sql | 24 +++++ extract-latest-project-version.sql | 36 +++++++ pyproject-latest-to-csv.py | 28 +----- ...ct-sqlite-get-files-and-extract-backend.py | 96 ++++++++++--------- query-projects.sql | 18 ++++ 6 files changed, 153 insertions(+), 69 deletions(-) create mode 100644 create-table-backend.sql create mode 100644 extract-all-projects-versions.sql create mode 100644 extract-latest-project-version.sql create mode 100644 query-projects.sql diff --git a/create-table-backend.sql b/create-table-backend.sql new file mode 100644 index 0000000..583cafd --- /dev/null +++ b/create-table-backend.sql @@ -0,0 +1,20 @@ +-- Define the dialect +-- sqlfluff:dialect:sqlite + +-- Set a smaller indent for this file +-- sqlfluff:indentation:tab_space_size:2 + +-- Set keywords to be capitalised +-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper + +CREATE TABLE IF NOT EXISTS backends +( + repository TEXT, + project_name TEXT, + project_version TEXT, + backend TEXT, + nb_uploads INTEGER, + uploaded_on TEXT, + year INTEGER, + path TEXT +); diff --git a/extract-all-projects-versions.sql b/extract-all-projects-versions.sql new file mode 100644 index 0000000..f5803ea --- /dev/null +++ b/extract-all-projects-versions.sql @@ -0,0 +1,24 @@ +-- Define the dialect +-- sqlfluff:dialect:duckdb + +-- Set a smaller indent for this file +-- sqlfluff:indentation:tab_space_size:2 + +-- Set keywords to be capitalised +-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper + +SELECT + project_name, + COUNT(project_name) AS nb_uploads, + MAX(project_version) AS max_version, + LIST(DISTINCT project_version) AS all_versions, + MAX(uploaded_on) AS max_uploaded_on, + LIST(DISTINCT uploaded_on) AS all_uploaded_on, + LIST(DISTINCT repository) AS all_repository, + LIST(DISTINCT path) AS all_path +FROM '*.parquet' +WHERE + (DATE_PART('year', uploaded_on) >= '2018') + AND REGEXP_MATCHES(path, 'pyproject.toml$') + AND skip_reason = '' +GROUP BY project_name; diff --git a/extract-latest-project-version.sql b/extract-latest-project-version.sql new file mode 100644 index 0000000..16945e8 --- /dev/null +++ b/extract-latest-project-version.sql @@ -0,0 +1,36 @@ +-- Define the dialect +-- sqlfluff:dialect:duckdb + +-- Set a smaller indent for this file +-- sqlfluff:indentation:tab_space_size:2 + +-- Set keywords to be capitalised +-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper + +WITH lpv AS ( + SELECT + project_name, + COUNT(project_name) AS nb_uploads, + MAX(uploaded_on) AS max_uploaded_on, + LIST(DISTINCT uploaded_on) AS all_uploaded_on + FROM '*.parquet' + WHERE + (DATE_PART('year', uploaded_on) >= '2018') + AND REGEXP_MATCHES(path, 'pyproject.toml$') + AND skip_reason = '' + GROUP BY project_name +) + +SELECT + ip.repository, + ip.project_name, + ip.project_version, + lpv.nb_uploads, + ip.uploaded_on, + DATE_PART('year', ip.uploaded_on) AS year, + ip.path +FROM '*.parquet' AS ip +JOIN + lpv + ON ip.project_name = lpv.project_name AND ip.uploaded_on = lpv.max_uploaded_on +WHERE REGEXP_MATCHES(path, 'pyproject.toml$') AND skip_reason = ''; diff --git a/pyproject-latest-to-csv.py b/pyproject-latest-to-csv.py index 7ebca37..428eb33 100644 --- a/pyproject-latest-to-csv.py +++ b/pyproject-latest-to-csv.py @@ -9,33 +9,15 @@ https://duckdb.org/docs/guides/python/execute_sql import duckdb -ALL_VERSIONS_QUERY = """SELECT project_name, COUNT(project_name) AS nb_uploads, - MAX(project_version) AS max_version, - LIST(DISTINCT project_version) AS all_versions, - MAX(uploaded_on) AS max_uploaded_on, - LIST(DISTINCT uploaded_on) AS all_uploaded_on, - LIST(DISTINCT repository) AS all_repository, - LIST(DISTINCT path) AS all_path - FROM '*.parquet' - WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject.toml$') AND skip_reason == '' - GROUP BY project_name; -""" +with open('extract-all-projects-versions.sql', 'r') as f: + ALL_VERSIONS_QUERY = f.read() res = duckdb.sql(ALL_VERSIONS_QUERY) + res.to_csv("extract-pyproject-all-versions.csv", header=True) -LATEST_QUERY = """WITH lpv AS (SELECT project_name, COUNT(project_name) AS nb_uploads, - MAX(uploaded_on) AS max_uploaded_on, - LIST(DISTINCT uploaded_on) AS all_uploaded_on - FROM '*.parquet' - WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject.toml$') AND skip_reason == '' - GROUP BY project_name) -SELECT ip.repository, ip.project_name, ip.project_version, lpv.nb_uploads, - ip.uploaded_on, date_part('year', ip.uploaded_on) AS year, ip.path - FROM '*.parquet' as ip - JOIN lpv ON ip.project_name=lpv.project_name AND ip.uploaded_on=lpv.max_uploaded_on - WHERE regexp_matches(path, 'pyproject.toml$') AND skip_reason == ''; -""" +with open('extract-latest-project-version.sql', 'r') as f: + LATEST_QUERY = f.read() # res = duckdb.sql(LATEST_QUERY).show() diff --git a/pyproject-sqlite-get-files-and-extract-backend.py b/pyproject-sqlite-get-files-and-extract-backend.py index 6d31a15..48c12b8 100644 --- a/pyproject-sqlite-get-files-and-extract-backend.py +++ b/pyproject-sqlite-get-files-and-extract-backend.py @@ -42,35 +42,17 @@ import pycodeorg LOG = logging.getLogger(__name__) -CREATE_BACKEND = """CREATE TABLE IF NOT EXISTS backends -(repository TEXT, - project_name TEXT, - project_version TEXT, - backend TEXT, - nb_uploads INTEGER, - uploaded_on TEXT, - year INTEGER, - path TEXT -); -""" - -QUERY = """SELECT repository, project_name, project_version, -nb_uploads, uploaded_on, year, path -FROM pyprojects -""" - -INSERT_BACKEND = """INSERT INTO backends -VALUES (:repository, :project_name, :project_version, - :backend, :nb_uploads, :uploaded_on, :year, :path) -""" if __name__ == "__main__": start_time = time.time() - logging.basicConfig(filename='pyproject-backends.log', level=logging.INFO) + logging.basicConfig(filename="pyproject-backends.log", level=logging.INFO) # Create backend table # -------------------- + with open("create-table-backend.sql", "r") as f: + CREATE_BACKEND = f.read() + cnx_backend = sqlite3.connect("pyproject_backends.db") cur_backend = cnx_backend.cursor() @@ -78,37 +60,44 @@ if __name__ == "__main__": # Get project data # ---------------- - cnx_proj = sqlite3.connect('extract-pyproject-latest.db') + with open("query-projects.sql", "r") as f: + QUERY_PROJECTS = f.read() + + cnx_proj = sqlite3.connect("extract-pyproject-latest.db") cnx_proj.row_factory = sqlite3.Row cur_proj = cnx_proj.cursor() cur_proj.execute("SELECT COUNT(*) AS nb FROM pyprojects;") r = cur_proj.fetchone() - total = r['nb'] + total = r["nb"] cpt = 0 - for row in cur_proj.execute(QUERY): + for row in cur_proj.execute(QUERY_PROJECTS): values = { - "repository": row["repository"], - "project_name": row["project_name"], - "project_version": row["project_version"], - "nb_uploads": row["nb_uploads"], - "uploaded_on": row["uploaded_on"], - "year": row["year"], - "path": row["path"] - } + "repository": row["repository"], + "project_name": row["project_name"], + "project_version": row["project_version"], + "nb_uploads": row["nb_uploads"], + "uploaded_on": row["uploaded_on"], + "year": row["year"], + "path": row["path"], + } # Only fetch the pyproject.toml at the root of the project # -------------------------------------------------------- - parts = values['path'].split("/") + parts = values["path"].split("/") if len(parts) == 5 and parts[-1] == "pyproject.toml": - # Fetch the file data from the dataset # ------------------------------------ try: - data = pycodeorg.get_data(values['repository'], values['project_name'], values['path']) + data = pycodeorg.get_data( + values["repository"], values["project_name"], values["path"] + ) except ValueError as e: - LOG.error("pycodeorg.get_data failed to retrieve %s: '%s'" % (values['project_name'], e)) + LOG.error( + "pycodeorg.get_data failed to retrieve %s: '%s'" + % (values["project_name"], e) + ) continue # Then get the 'build-backend' value with a toml library @@ -116,29 +105,45 @@ if __name__ == "__main__": try: toml_dict = tomli.loads(data.decode()) except (tomli.TOMLDecodeError, UnicodeDecodeError) as e: - LOG.error("Error reading TOML file for %s: '%s'" % (values['project_name'], e)) + LOG.error( + "Error reading TOML file for %s: '%s'" % (values["project_name"], e) + ) continue # print(f"{toml_dict=}") - if toml_dict.get('build-system') and toml_dict['build-system'].get('build-backend'): - backend = toml_dict['build-system'].get('build-backend') + if toml_dict.get("build-system") and toml_dict["build-system"].get( + "build-backend" + ): + backend = toml_dict["build-system"].get("build-backend") - values['backend'] = backend + values["backend"] = backend print(f"{values['project_name']} : {values['backend']}") else: - values['backend'] = None + values["backend"] = None print(f"{values['project_name']} : .......... no backend found") try: - cur_backend.execute(INSERT_BACKEND, values) + cur_backend.execute( + """INSERT INTO backends + VALUES (:repository, :project_name, :project_version, + :backend, :nb_uploads, :uploaded_on, :year, :path) + """, + values, + ) cnx_backend.commit() except sqlite3.InterfaceError as e: - LOG.error("Error writing to sqlite3 for %s: '%s'" % (values['project_name'], e)) + LOG.error( + "Error writing to sqlite3 for %s: '%s'" + % (values["project_name"], e) + ) continue else: - LOG.info(f"%s is not a root path for %s" % (values['path'], values['project_name'])) + LOG.info( + f"%s is not a root path for %s" + % (values["path"], values["project_name"]) + ) cpt = cpt + 1 if cpt % 2500 == 0: @@ -153,4 +158,3 @@ if __name__ == "__main__": LOG.info(duration_msg) print(duration_msg) - diff --git a/query-projects.sql b/query-projects.sql new file mode 100644 index 0000000..39a726f --- /dev/null +++ b/query-projects.sql @@ -0,0 +1,18 @@ +-- Define the dialect +-- sqlfluff:dialect:sqlite + +-- Set a smaller indent for this file +-- sqlfluff:indentation:tab_space_size:2 + +-- Set keywords to be capitalised +-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper + +SELECT + repository, + project_name, + project_version, + nb_uploads, + uploaded_on, + year, + path +FROM pyprojects; -- GitLab