Skip to content
Snippets Groups Projects
Commit bb2f9c1a authored by Françoise Conil's avatar Françoise Conil
Browse files

Light refactoring

Extract the duckdb and sqlite queries from the code.
parent c8fb64da
No related branches found
No related tags found
No related merge requests found
-- Define the dialect
-- sqlfluff:dialect:sqlite
-- Set a smaller indent for this file
-- sqlfluff:indentation:tab_space_size:2
-- Set keywords to be capitalised
-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper
-- Results table: one row per inspected upload, recording which build
-- backend (if any) the project's pyproject.toml declares, plus
-- provenance columns copied from the source dataset.
CREATE TABLE IF NOT EXISTS backends
(
repository TEXT, -- repository the file was fetched from
project_name TEXT,
project_version TEXT,
backend TEXT, -- [build-system].build-backend value; NULL when absent
nb_uploads INTEGER, -- number of uploads seen for this project
uploaded_on TEXT, -- upload timestamp (kept as text in sqlite)
year INTEGER, -- year extracted from uploaded_on
path TEXT -- path of the pyproject.toml inside the release
);
-- Define the dialect
-- sqlfluff:dialect:duckdb
-- Set a smaller indent for this file
-- sqlfluff:indentation:tab_space_size:2
-- Set keywords to be capitalised
-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper
-- All-versions aggregation: one row per project that uploaded a
-- pyproject.toml since 2018, listing every version / timestamp /
-- repository / path seen across the '*.parquet' dataset.
SELECT
  project_name,
  COUNT(project_name) AS nb_uploads,
  MAX(project_version) AS max_version,
  LIST(DISTINCT project_version) AS all_versions,
  MAX(uploaded_on) AS max_uploaded_on,
  LIST(DISTINCT uploaded_on) AS all_uploaded_on,
  LIST(DISTINCT repository) AS all_repository,
  LIST(DISTINCT path) AS all_path
FROM '*.parquet'
WHERE
  (DATE_PART('year', uploaded_on) >= '2018')
  -- Dot escaped: an unescaped '.' is a regex wildcard and would also
  -- match paths such as '.../pyprojectXtoml'.
  AND REGEXP_MATCHES(path, 'pyproject\.toml$')
  AND skip_reason = ''
GROUP BY project_name;
-- Define the dialect
-- sqlfluff:dialect:duckdb
-- Set a smaller indent for this file
-- sqlfluff:indentation:tab_space_size:2
-- Set keywords to be capitalised
-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper
-- Latest-version extraction: for each project publishing a
-- pyproject.toml since 2018, keep only the upload row whose timestamp
-- equals that project's most recent upload.
WITH lpv AS (
  SELECT
    project_name,
    COUNT(project_name) AS nb_uploads,
    MAX(uploaded_on) AS max_uploaded_on,
    LIST(DISTINCT uploaded_on) AS all_uploaded_on
  FROM '*.parquet'
  WHERE
    (DATE_PART('year', uploaded_on) >= '2018')
    -- Dot escaped: an unescaped '.' would also match e.g. 'pyprojectXtoml'.
    AND REGEXP_MATCHES(path, 'pyproject\.toml$')
    AND skip_reason = ''
  GROUP BY project_name
)

SELECT
  ip.repository,
  ip.project_name,
  ip.project_version,
  lpv.nb_uploads,
  ip.uploaded_on,
  DATE_PART('year', ip.uploaded_on) AS year,
  ip.path
FROM '*.parquet' AS ip
JOIN
  lpv
  ON ip.project_name = lpv.project_name AND ip.uploaded_on = lpv.max_uploaded_on
-- Same escaped pattern as the CTE so both filters select identical rows.
WHERE REGEXP_MATCHES(path, 'pyproject\.toml$') AND skip_reason = '';
......@@ -9,33 +9,15 @@ https://duckdb.org/docs/guides/python/execute_sql
import duckdb
# DuckDB query: aggregate, per project uploading a pyproject.toml since
# 2018, every version/timestamp/repository/path seen in '*.parquet'.
# Raw string: the regex dot must be escaped ('pyproject\.toml$');
# unescaped it is a wildcard and would also match 'pyprojectXtoml'.
ALL_VERSIONS_QUERY = r"""SELECT project_name, COUNT(project_name) AS nb_uploads,
MAX(project_version) AS max_version,
LIST(DISTINCT project_version) AS all_versions,
MAX(uploaded_on) AS max_uploaded_on,
LIST(DISTINCT uploaded_on) AS all_uploaded_on,
LIST(DISTINCT repository) AS all_repository,
LIST(DISTINCT path) AS all_path
FROM '*.parquet'
WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject\.toml$') AND skip_reason == ''
GROUP BY project_name;
"""
with open('extract-all-projects-versions.sql', 'r') as f:
ALL_VERSIONS_QUERY = f.read()
res = duckdb.sql(ALL_VERSIONS_QUERY)
res.to_csv("extract-pyproject-all-versions.csv", header=True)
# DuckDB query: per project, keep the upload row matching the most
# recent upload timestamp (computed in the 'lpv' CTE).
# Raw string: the regex dot must be escaped ('pyproject\.toml$');
# unescaped it is a wildcard and would also match 'pyprojectXtoml'.
LATEST_QUERY = r"""WITH lpv AS (SELECT project_name, COUNT(project_name) AS nb_uploads,
MAX(uploaded_on) AS max_uploaded_on,
LIST(DISTINCT uploaded_on) AS all_uploaded_on
FROM '*.parquet'
WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject\.toml$') AND skip_reason == ''
GROUP BY project_name)
SELECT ip.repository, ip.project_name, ip.project_version, lpv.nb_uploads,
ip.uploaded_on, date_part('year', ip.uploaded_on) AS year, ip.path
FROM '*.parquet' as ip
JOIN lpv ON ip.project_name=lpv.project_name AND ip.uploaded_on=lpv.max_uploaded_on
WHERE regexp_matches(path, 'pyproject\.toml$') AND skip_reason == '';
"""
with open('extract-latest-project-version.sql', 'r') as f:
LATEST_QUERY = f.read()
# res = duckdb.sql(LATEST_QUERY).show()
......
......@@ -42,35 +42,17 @@ import pycodeorg
LOG = logging.getLogger(__name__)
# DDL executed against the sqlite3 results database: one row per
# inspected upload, with the parsed build-backend (NULL when the
# pyproject.toml declares no [build-system] build-backend).
CREATE_BACKEND = """CREATE TABLE IF NOT EXISTS backends
(repository TEXT,
project_name TEXT,
project_version TEXT,
backend TEXT,
nb_uploads INTEGER,
uploaded_on TEXT,
year INTEGER,
path TEXT
);
"""
# Feed query: reads every recorded row from the source sqlite3 table
# `pyprojects`; iterated row-by-row by the main loop.
QUERY = """SELECT repository, project_name, project_version,
nb_uploads, uploaded_on, year, path
FROM pyprojects
"""
# Named-placeholder INSERT: executed with a dict whose keys match the
# :placeholders, so values bind by name rather than position.
INSERT_BACKEND = """INSERT INTO backends
VALUES (:repository, :project_name, :project_version,
:backend, :nb_uploads, :uploaded_on, :year, :path)
"""
if __name__ == "__main__":
start_time = time.time()
logging.basicConfig(filename='pyproject-backends.log', level=logging.INFO)
logging.basicConfig(filename="pyproject-backends.log", level=logging.INFO)
# Create backend table
# --------------------
with open("create-table-backend.sql", "r") as f:
CREATE_BACKEND = f.read()
cnx_backend = sqlite3.connect("pyproject_backends.db")
cur_backend = cnx_backend.cursor()
......@@ -78,37 +60,44 @@ if __name__ == "__main__":
# Get project data
# ----------------
cnx_proj = sqlite3.connect('extract-pyproject-latest.db')
with open("query-projects.sql", "r") as f:
QUERY_PROJECTS = f.read()
cnx_proj = sqlite3.connect("extract-pyproject-latest.db")
cnx_proj.row_factory = sqlite3.Row
cur_proj = cnx_proj.cursor()
cur_proj.execute("SELECT COUNT(*) AS nb FROM pyprojects;")
r = cur_proj.fetchone()
total = r['nb']
total = r["nb"]
cpt = 0
for row in cur_proj.execute(QUERY):
for row in cur_proj.execute(QUERY_PROJECTS):
values = {
"repository": row["repository"],
"project_name": row["project_name"],
"project_version": row["project_version"],
"nb_uploads": row["nb_uploads"],
"uploaded_on": row["uploaded_on"],
"year": row["year"],
"path": row["path"]
}
"repository": row["repository"],
"project_name": row["project_name"],
"project_version": row["project_version"],
"nb_uploads": row["nb_uploads"],
"uploaded_on": row["uploaded_on"],
"year": row["year"],
"path": row["path"],
}
# Only fetch the pyproject.toml at the root of the project
# --------------------------------------------------------
parts = values['path'].split("/")
parts = values["path"].split("/")
if len(parts) == 5 and parts[-1] == "pyproject.toml":
# Fetch the file data from the dataset
# ------------------------------------
try:
data = pycodeorg.get_data(values['repository'], values['project_name'], values['path'])
data = pycodeorg.get_data(
values["repository"], values["project_name"], values["path"]
)
except ValueError as e:
LOG.error("pycodeorg.get_data failed to retrieve %s: '%s'" % (values['project_name'], e))
LOG.error(
"pycodeorg.get_data failed to retrieve %s: '%s'"
% (values["project_name"], e)
)
continue
# Then get the 'build-backend' value with a toml library
......@@ -116,29 +105,45 @@ if __name__ == "__main__":
try:
toml_dict = tomli.loads(data.decode())
except (tomli.TOMLDecodeError, UnicodeDecodeError) as e:
LOG.error("Error reading TOML file for %s: '%s'" % (values['project_name'], e))
LOG.error(
"Error reading TOML file for %s: '%s'" % (values["project_name"], e)
)
continue
# print(f"{toml_dict=}")
if toml_dict.get('build-system') and toml_dict['build-system'].get('build-backend'):
backend = toml_dict['build-system'].get('build-backend')
if toml_dict.get("build-system") and toml_dict["build-system"].get(
"build-backend"
):
backend = toml_dict["build-system"].get("build-backend")
values['backend'] = backend
values["backend"] = backend
print(f"{values['project_name']} : {values['backend']}")
else:
values['backend'] = None
values["backend"] = None
print(f"{values['project_name']} : .......... no backend found")
try:
cur_backend.execute(INSERT_BACKEND, values)
cur_backend.execute(
"""INSERT INTO backends
VALUES (:repository, :project_name, :project_version,
:backend, :nb_uploads, :uploaded_on, :year, :path)
""",
values,
)
cnx_backend.commit()
except sqlite3.InterfaceError as e:
LOG.error("Error writing to sqlite3 for %s: '%s'" % (values['project_name'], e))
LOG.error(
"Error writing to sqlite3 for %s: '%s'"
% (values["project_name"], e)
)
continue
else:
LOG.info(f"%s is not a root path for %s" % (values['path'], values['project_name']))
LOG.info(
f"%s is not a root path for %s"
% (values["path"], values["project_name"])
)
cpt = cpt + 1
if cpt % 2500 == 0:
......@@ -153,4 +158,3 @@ if __name__ == "__main__":
LOG.info(duration_msg)
print(duration_msg)
-- Define the dialect
-- sqlfluff:dialect:sqlite
-- Set a smaller indent for this file
-- sqlfluff:indentation:tab_space_size:2
-- Set keywords to be capitalised
-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper
-- Feed query for the backend-extraction script: every column of every
-- row in `pyprojects`; consumers access columns by name.
SELECT
repository,
project_name,
project_version,
nb_uploads,
uploaded_on,
year,
path
FROM pyprojects;
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment