Skip to content
Snippets Groups Projects
Commit 1f436976 authored by Françoise Conil's avatar Françoise Conil
Browse files

Get latest package pyproject.toml files

Get the latest package of each project in the 2018-2023 range.
Search for the backend in each pyproject.toml file (more than half of the
pyproject.toml files do not have a "build-backend" line).
parents
No related branches found
No related tags found
No related merge requests found
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# Testing tools
.pytest_cache/
# Distribution / packaging
./build/
dist/
# Environments
.venv
venv/
# IDE specific
*~
.vscode/settings.json
# OS specific
*.DS_Store
"""
https://docs.python.org/3.9/library/sqlite3.html
https://www.sqlite.org/uri.html : file path, URI and read-only mode
https://matplotlib.org/2.0.2/users/pyplot_tutorial.html
https://matplotlib.org/2.0.2/examples/api/barchart_demo.html
"""
from pathlib import Path
import sqlite3
import matplotlib.pyplot as plt
BACKEND_PATH = Path("~/Progs/python/duckdb/pyproject_backends.db")
QUERY = "SELECT * FROM backends"

# Backends used more often than this get a blue bar and a blue x label;
# the others are drawn in cyan.
HIGHLIGHT_THRESHOLD = 500


def load_backend_counts(db_path=BACKEND_PATH):
    """Read the per-backend usage counts from the ``backends`` table.

    The database is opened read-only through an SQLite URI.  ``uri=True`` is
    required for that: without it ``sqlite3.connect`` treats the whole
    ``file:...?mode=ro`` string as a plain file name, so the real database is
    never opened.

    Args:
        db_path: Path of the SQLite database (``~`` is expanded).

    Returns:
        A ``(total, backends, counts)`` tuple: the number of rows with a
        non-NULL backend, the list of distinct backend names, and the list of
        occurrences of each backend (same order as ``backends``).

    Raises:
        sqlite3.OperationalError: if the database cannot be opened read-only
            (e.g. it does not exist) or has no ``backends`` table.
    """
    # as_uri() percent-encodes the path, so characters such as '?' or '#'
    # cannot be mistaken for the start of the URI query/fragment.
    uri = f"{Path(db_path).expanduser().resolve().as_uri()}?mode=ro"
    cnx = sqlite3.connect(uri, uri=True)
    try:
        cnx.row_factory = sqlite3.Row
        cur = cnx.cursor()

        cur.execute("SELECT COUNT(*) AS nb FROM backends WHERE backend IS NOT NULL;")
        total = cur.fetchone()['nb']

        cur.execute("SELECT backend, COUNT(backend) AS nb FROM backends WHERE backend IS NOT NULL GROUP BY backend;")
        rows = cur.fetchall()
    finally:
        # Release the connection even when one of the queries fails.
        cnx.close()

    return total, [row[0] for row in rows], [row[1] for row in rows]


def plot_backends(backends_total, backends, backend_nb,
                  output="python-backends-2018-2023.png"):
    """Draw the backend usage bar chart and save it to ``output``.

    Args:
        backends_total: Total number of backends found, shown in the
            figure title.
        backends: Backend names, one bar per name.
        backend_nb: Number of uses of each backend, same order as
            ``backends``.
        output: File name passed to ``plt.savefig``.
    """
    fig = plt.figure(num=f'{backends_total} backends used in pyproject.toml on PyPI (2018-2023)', figsize=(12, 8))
    ax = fig.add_subplot(1, 1, 1)

    bars = ax.bar(
        backends,
        backend_nb,
        color=['blue' if n > HIGHLIGHT_THRESHOLD else 'cyan' for n in backend_nb],
    )

    ax.set_xlabel("Backend")
    ax.set_ylabel("Times used")

    # Fix the tick positions first: set_xticklabels() must only be used with
    # a fixed number of ticks, otherwise matplotlib emits a UserWarning.
    ax.set_xticks(range(len(backends)))
    # Rotating x-axis labels for better readability
    ax.set_xticklabels(backends, rotation=70, ha='right')

    # Add y value labels above the bars
    for bar, nb in zip(bars, backend_nb):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5, str(nb), ha='center', va='bottom')

    # Color x-axis labels with the same rule as the bars
    for label, n in zip(ax.get_xticklabels(), backend_nb):
        if n > HIGHLIGHT_THRESHOLD:
            label.set_color('blue')

    # Adjust layout to prevent clipping of rotated labels
    plt.tight_layout()

    plt.savefig(output, dpi=600)


if __name__ == "__main__":
    backends_total, backends, backend_nb = load_backend_counts(BACKEND_PATH)
    plot_backends(backends_total, backends, backend_nb)
"""
https://sethmlarson.dev/security-developer-in-residence-weekly-report-18
https://duckdb.org/docs/sql/query_syntax/with
https://duckdb.org/docs/sql/functions/timestamp
https://duckdb.org/docs/sql/aggregates
https://duckdb.org/docs/guides/python/execute_sql
"""
import duckdb
# One row per project: upload count plus the aggregated versions, upload
# dates, repositories and paths of every row read from the parquet files
# whose path matches 'pyproject.toml$', whose skip_reason is empty and whose
# upload year is 2018 or later.
ALL_VERSIONS_QUERY = """SELECT project_name, COUNT(project_name) AS nb_uploads,
MAX(project_version) AS max_version,
LIST(DISTINCT project_version) AS all_versions,
MAX(uploaded_on) AS max_uploaded_on,
LIST(DISTINCT uploaded_on) AS all_uploaded_on,
LIST(DISTINCT repository) AS all_repository,
LIST(DISTINCT path) AS all_path
FROM '*.parquet'
WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject.toml$') AND skip_reason == ''
GROUP BY project_name;
"""

# The "lpv" CTE computes, per project, the upload count and the most recent
# upload date; the outer query joins it back on the parquet rows to keep only
# the rows uploaded at that most recent date.
LATEST_QUERY = """WITH lpv AS (SELECT project_name, COUNT(project_name) AS nb_uploads,
MAX(uploaded_on) AS max_uploaded_on,
LIST(DISTINCT uploaded_on) AS all_uploaded_on
FROM '*.parquet'
WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject.toml$') AND skip_reason == ''
GROUP BY project_name)
SELECT ip.repository, ip.project_name, ip.project_version, lpv.nb_uploads,
ip.uploaded_on, date_part('year', ip.uploaded_on) AS year, ip.path
FROM '*.parquet' as ip
JOIN lpv ON ip.project_name=lpv.project_name AND ip.uploaded_on=lpv.max_uploaded_on
WHERE regexp_matches(path, 'pyproject.toml$') AND skip_reason == '';
"""

# Run each query and dump its result, with a header line, to a CSV file.
# The order (all versions first, then latest) is the same as before.
for query, csv_name in (
    (ALL_VERSIONS_QUERY, "extract-pyproject-all-versions.csv"),
    (LATEST_QUERY, "extract-pyproject-latest.csv"),
):
    res = duckdb.sql(query)
    res.to_csv(csv_name, header=True)
"""
https://docs.python.org/3.9/library/sqlite3.html
https://www.sqlite.org/lang_datefunc.html
$ sqlite3 extract-pyproject-latest.db
sqlite> .schema
CREATE TABLE pyprojects (
repository TEXT,
project_name TEXT,
project_version TEXT,
nb_uploads INTEGER,
uploaded_on TEXT,
year INTEGER,
path TEXT
);
sqlite> .mode table
sqlite> select DISTINCT project_name, project_version, nb_uploads, uploaded_on, year from pyprojects order by nb_uploads desc limit 10;
+-------------------------------+-----------------+------------+-------------------------+------+
| project_name | project_version | nb_uploads | uploaded_on | year |
+-------------------------------+-----------------+------------+-------------------------+------+
| OZI | 0.0.219 | 7304 | 2023-11-24 23:59:27.615 | 2023 |
| teamhack-nmap | 0.0.4328 | 4140 | 2023-11-21 07:13:54.906 | 2023 |
| ddtrace | 2.3.1 | 3804 | 2023-11-22 20:07:23.221 | 2023 |
| pdm | 2.10.4 | 1672 | 2023-11-24 01:43:35.052 | 2023 |
| cdktf-cdktf-provider-newrelic | 11.0.4 | 1657 | 2023-11-09 03:18:09.907 | 2023 |
| utilmy | 0.1.17009007 | 1500 | 2023-11-25 08:26:46.145 | 2023 |
| poetry-core | 1.8.1 | 1440 | 2023-10-31 16:03:40.219 | 2023 |
| poetry | 1.7.1 | 1409 | 2023-11-16 19:09:08.238 | 2023 |
| pepperize.cdk-organizations | 0.7.742 | 1381 | 2023-11-23 00:16:04.116 | 2023 |
| coiled | 1.1.16.dev4 | 1374 | 2023-11-23 19:49:42.64 | 2023 |
+-------------------------------+-----------------+------------+-------------------------+------+
"""
import logging
import sqlite3
import re
import pycodeorg
LOG = logging.getLogger(__name__)

CREATE_BACKEND = """CREATE TABLE IF NOT EXISTS backends
(repository TEXT,
project_name TEXT,
project_version TEXT,
backend TEXT,
nb_uploads INTEGER,
uploaded_on TEXT,
year INTEGER,
path TEXT
);
"""

# NOTE(review): only the rows with year=2018 are processed here — confirm
# whether the whole 2018-2023 range is intended.
QUERY = """SELECT repository, project_name, project_version,
nb_uploads, uploaded_on, year, path
FROM pyprojects
WHERE year=2018
"""

INSERT_BACKEND = """INSERT INTO backends
VALUES (:repository, :project_name, :project_version,
:backend, :nb_uploads, :uploaded_on, :year, :path)
"""

# Column names of the rows returned by QUERY, in SELECT order.
PROJECT_COLUMNS = (
    "repository",
    "project_name",
    "project_version",
    "nb_uploads",
    "uploaded_on",
    "year",
    "path",
)

# A "build-backend" key at the start of a line, with a value quoted with
# either TOML quote style.  The value is a Python object reference
# ("package.module" or "package.module:object"), so underscores and colons
# must be accepted: "setuptools.build_meta", "flit_core.buildapi",
# "setuptools.build_meta:__legacy__", ...
BUILD_BACKEND_RE = re.compile(
    rb'^[ \t]*build-backend[ \t]*=[ \t]*(["\'])([\w.:-]+)\1',
    re.MULTILINE,
)


def find_build_backend(data: bytes):
    """Return the ``build-backend`` value found in a pyproject.toml file.

    Args:
        data: Raw content of the pyproject.toml file.

    Returns:
        The backend name as a ``str``, or ``None`` when the file has no
        ``build-backend`` line.
    """
    match = BUILD_BACKEND_RE.search(data)
    if match is None:
        return None
    return match.group(2).decode()


def main():
    """Store the build backend of every selected project in the backends DB.

    Each row returned by QUERY on ``extract-pyproject-latest.db`` is
    completed with the backend parsed from its pyproject.toml file and
    inserted into the ``backends`` table of ``pyproject_backends.db``.
    Projects whose file cannot be retrieved are logged and skipped.
    """
    logging.basicConfig(filename='pyproject-backends.log', level=logging.ERROR)

    # Create backend table
    # --------------------
    cnx_backend = sqlite3.connect("pyproject_backends.db")
    try:
        cur_backend = cnx_backend.cursor()
        cur_backend.execute(CREATE_BACKEND)

        # Get project data
        # ----------------
        cnx_proj = sqlite3.connect('extract-pyproject-latest.db')
        try:
            cur_proj = cnx_proj.cursor()

            for row in cur_proj.execute(QUERY):
                t_values = dict(zip(PROJECT_COLUMNS, row))

                # Fetch the file data from the dataset
                # ------------------------------------
                try:
                    data = pycodeorg.get_data(t_values['repository'], t_values['project_name'], t_values['path'])
                except ValueError as e:
                    LOG.error("pycodeorg.get_data failed to retrieve %s: '%s'", t_values['project_name'], e)
                    # Without the file there is nothing to parse: skip the
                    # project instead of reusing the previous project's data
                    # (or failing on an undefined name on the first row).
                    continue

                # Then parse the 'build-backend' field and aggregate
                # --------------------------------------------------
                t_values['backend'] = find_build_backend(data)
                if t_values['backend'] is not None:
                    print(f"{t_values['project_name']} : {t_values['backend']}")
                else:
                    print(f"{t_values['project_name']} : .......... no backend found")

                cur_backend.execute(INSERT_BACKEND, t_values)
                # Commit row by row so that an interrupted run keeps the
                # projects already processed.
                cnx_backend.commit()
        finally:
            cnx_proj.close()
    finally:
        cnx_backend.close()


if __name__ == "__main__":
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment