diff --git a/plot-packaging-backends.py b/plot-packaging-backends.py index 869781c0dbd0c8cf0771571eb62c24cb919f6177..c441da0e504c0cd5426de1449005e6ed072773b0 100644 --- a/plot-packaging-backends.py +++ b/plot-packaging-backends.py @@ -55,7 +55,7 @@ if __name__ == "__main__": ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5, str(nb), ha='center', va='bottom') # Set logarithmic scale on y-axis - # ax.set_yscale('log') + ax.set_yscale('log') # Color x-axis labels based on the condition for label, n in zip(ax.get_xticklabels(), backend_nb): @@ -63,7 +63,7 @@ if __name__ == "__main__": label.set_color('blue') # Adjust layout to prevent clipping of rotated labels - plt.tight_layout() + # plt.tight_layout() #plt.show() plt.savefig("python-backends-2018-2023.png", dpi=600) diff --git a/pyproject-sqlite-get-files-and-extract-backend.py b/pyproject-sqlite-get-files-and-extract-backend.py index 8a1e77596d46ec853057e3f05a3b7eb975ba0569..6d31a1510bed679c1921abcc7bbf3b0a6575be0b 100644 --- a/pyproject-sqlite-get-files-and-extract-backend.py +++ b/pyproject-sqlite-get-files-and-extract-backend.py @@ -36,6 +36,8 @@ sqlite> select DISTINCT project_name, project_version, nb_uploads, uploaded_on, import logging import sqlite3 import re +import time +import tomli import pycodeorg LOG = logging.getLogger(__name__) @@ -55,7 +57,6 @@ CREATE_BACKEND = """CREATE TABLE IF NOT EXISTS backends QUERY = """SELECT repository, project_name, project_version, nb_uploads, uploaded_on, year, path FROM pyprojects -WHERE year=2018 """ INSERT_BACKEND = """INSERT INTO backends @@ -64,7 +65,9 @@ VALUES (:repository, :project_name, :project_version, """ if __name__ == "__main__": - logging.basicConfig(filename='pyproject-backends.log', level=logging.ERROR) + start_time = time.time() + + logging.basicConfig(filename='pyproject-backends.log', level=logging.INFO) # Create backend table # -------------------- @@ -76,40 +79,78 @@ if __name__ == "__main__": # Get project data # ---------------- cnx_proj = sqlite3.connect('extract-pyproject-latest.db') + cnx_proj.row_factory = sqlite3.Row cur_proj = cnx_proj.cursor() + cur_proj.execute("SELECT COUNT(*) AS nb FROM pyprojects;") + r = cur_proj.fetchone() + total = r['nb'] + cpt = 0 + for row in cur_proj.execute(QUERY): - # print(row) - t_values = { - "repository": row[0], - "project_name": row[1], - "project_version": row[2], - "nb_uploads": row[3], - "uploaded_on": row[4], - "year": row[5], - "path": row[6] + values = { + "repository": row["repository"], + "project_name": row["project_name"], + "project_version": row["project_version"], + "nb_uploads": row["nb_uploads"], + "uploaded_on": row["uploaded_on"], + "year": row["year"], + "path": row["path"] } - # Fetch the file data from the dataset - # ------------------------------------ - try: - data = pycodeorg.get_data(t_values['repository'], t_values['project_name'], t_values['path']) - except ValueError as e: - LOG.error("pycodeorg.get_data failed to retrieve %s: '%s'" % (t_values['project_name'], e)) + # Only fetch the pyproject.toml at the root of the project + # -------------------------------------------------------- + parts = values['path'].split("/") + if len(parts) == 5 and parts[-1] == "pyproject.toml": + + # Fetch the file data from the dataset + # ------------------------------------ + try: + data = pycodeorg.get_data(values['repository'], values['project_name'], values['path']) + except ValueError as e: + LOG.error("pycodeorg.get_data failed to retrieve %s: '%s'" % (values['project_name'], e)) + continue + + # Then get the 'build-backend' value with a toml library + # ------------------------------------------------------ + try: + toml_dict = tomli.loads(data.decode()) + except (tomli.TOMLDecodeError, UnicodeDecodeError) as e: + LOG.error("Error reading TOML file for %s: '%s'" % (values['project_name'], e)) + continue + + # print(f"{toml_dict=}") + + if toml_dict.get('build-system') and toml_dict['build-system'].get('build-backend'): + backend = toml_dict['build-system'].get('build-backend') + + values['backend'] = backend + print(f"{values['project_name']} : {values['backend']}") + else: + values['backend'] = None + print(f"{values['project_name']} : .......... no backend found") + + try: + cur_backend.execute(INSERT_BACKEND, values) + cnx_backend.commit() + except sqlite3.InterfaceError as e: + LOG.error("Error writing to sqlite3 for %s: '%s'" % (values['project_name'], e)) + continue - # Then parse the 'build-backend' field and aggregate - # -------------------------------------------------- - if match := re.search(rb'\nbuild-backend\s*=\s*"([A-Za-z0-9-\.]+)"', data): - backend = match.group(1).decode() - - t_values['backend'] = backend - print(f"{t_values['project_name']} : {t_values['backend']}") else: - t_values['backend'] = None - print(f"{t_values['project_name']} : .......... no backend found") + LOG.info(f"%s is not a root path for %s" % (values['path'], values['project_name'])) - cur_backend.execute(INSERT_BACKEND, t_values) - cnx_backend.commit() + cpt = cpt + 1 + if cpt % 2500 == 0: + LOG.info("PROGRESS: %d / %d, %.2f %%" % (cpt, total, cpt * 100 / total)) cnx_proj.close() cnx_backend.close() + + end_time = time.time() + + duration_msg = f"Getting backends took : {end_time - start_time:0.3} seconds." + + LOG.info(duration_msg) + print(duration_msg) + diff --git a/python-backends-2018-2023-log-scale.png b/python-backends-2018-2023-log-scale.png index db4b08e06de725778085cd03002caa5bf96eba05..85c5c7ea95968132d6d9a930a5b69348d6ff5927 100644 Binary files a/python-backends-2018-2023-log-scale.png and b/python-backends-2018-2023-log-scale.png differ diff --git a/python-backends-2018-2023.png b/python-backends-2018-2023.png index 2db81c3afc7b519a128b94c766ac55c54184d206..08df20349101c9595df27f99850082d2e6101a5e 100644 Binary files a/python-backends-2018-2023.png and b/python-backends-2018-2023.png differ diff --git a/requirements.in b/requirements.in index 6c931d8426d72b2de8c62b8488c56c9315e65756..71cf1e7b996d202aa3239ea123d16495428a05d3 100644 --- a/requirements.in +++ b/requirements.in @@ -1,4 +1,5 @@ duckdb +tomli urllib3 matplotlib pandas diff --git a/requirements.txt b/requirements.txt index 10f1251d6f6dcf48d369c6f27645da81be3fd2bf..08dc80380a1b5f477f51fd0c2b00ee71d275ef8e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,6 +37,8 @@ pytz==2023.3.post1 # via pandas six==1.16.0 # via python-dateutil +tomli==2.0.1 + # via -r requirements.in tzdata==2023.3 # via pandas urllib3==2.1.0