Skip to content
Snippets Groups Projects
Commit 0333f89b authored by Françoise Conil's avatar Françoise Conil
Browse files

First query and visualization test with duckdb and matplotlib

parent c8fed9c9
No related branches found
No related tags found
No related merge requests found
[sqlfluff]
dialect = duckdb
[sqlfluff:indentation]
tab_space_size = 4
[sqlfluff:rules:capitalisation.keywords]
capitalisation_policy = upper
-- How do I make duckbox output show more than 40 rows?
-- https://github.com/duckdb/duckdb/discussions/10562
-- I first thought this was a configuration setting:
-- https://duckdb.org/docs/configuration/overview
-- but it is actually a "dot" command of the CLI:
-- https://duckdb.org/docs/api/cli/dot_commands.html
-- so it would not be available from Python:
-- https://stackoverflow.com/a/76860941
.maxrows 80
"""
https://matplotlib.org/2.0.2/users/pyplot_tutorial.html
https://matplotlib.org/2.0.2/examples/api/barchart_demo.html
"""
from pathlib import Path
import duckdb
import dotenv
# import pandas as pd
import matplotlib.pyplot as plt
def generate_date_tick(row):
    """Build the "YYYY-MM" label used as an x-axis tick for one result row.

    The function is designed for ``DataFrame.apply(..., axis=1)``. In that
    mode pandas hands over each row as a Series with one uniform dtype:

    - if a column contains float64 and the other columns are int64,
      all row elements will be float64;
    - if a column contains strings (object), all row elements will be
      object.

    See "Why data types was changed while calling the apply function in
    Pandas?": https://stackoverflow.com/a/40315866

    Both values are therefore converted with ``int()`` before formatting,
    so that an upcast row such as ``(2024.0, 3.0)`` still gives "2024-03"
    instead of failing on the ``d`` format code.

    :param row: mapping-like row (e.g. a pandas Series) exposing the
        ``cl_year`` and ``cl_month`` keys
    :return: the year followed by the zero-padded month, e.g. "2024-03"
    :rtype: str
    """
    year = int(row["cl_year"])
    month = int(row["cl_month"])
    return f"{year}-{month:02d}"
if __name__ == "__main__":
    # Script entry point: read the settings from the .env file, run the
    # monthly statistics query against the DuckDB database and plot the
    # two resulting series with matplotlib.

    # Get environment variables in an OrderedDict
    # https://github.com/theskumar/python-dotenv
    env_vars = dotenv.dotenv_values()

    # Fail early with a readable message instead of an opaque TypeError
    # raised later by Path(None) or open(None).
    required_keys = ("PYPI_DB_PATH", "PYPI_DB_NAME", "QUERY_ISSUES_STATS_BY_MONTH")
    missing_keys = [key for key in required_keys if env_vars.get(key) is None]
    if missing_keys:
        raise SystemExit(
            f"Missing settings in the .env file: {', '.join(missing_keys)}"
        )

    # Build the path to the issues DuckDB database file
    issues_db = Path(env_vars["PYPI_DB_PATH"]) / env_vars["PYPI_DB_NAME"]

    # Load the query from its file BEFORE connecting, so that a missing or
    # unreadable query file cannot leave a database connection open.
    with open(env_vars["QUERY_ISSUES_STATS_BY_MONTH"], "r", encoding="utf-8") as f:
        QUERY = f.read()

    # Connect to the file (no failure if nothing to expand)
    cnx = duckdb.connect(str(issues_db.expanduser()), read_only=True)
    try:
        # Execute query and get a Pandas DataFrame back
        # https://duckdb.org/docs/api/python/overview#result-conversion
        df = cnx.sql(QUERY).df()
    finally:
        # The DataFrame is fully materialized: release the database file.
        cnx.close()

    # With no rows, the row-wise apply below returns a DataFrame rather
    # than a Series and the column assignment fails; stop cleanly instead.
    if df.empty:
        raise SystemExit("The query returned no rows: nothing to plot.")

    # Create a value for x tick labelling
    df["x_tick"] = df.apply(generate_date_tick, axis=1)

    fig = plt.figure(num="PyPI Issues", figsize=(12, 8), layout="constrained")
    ax = fig.add_subplot(1, 1, 1)

    # https://matplotlib.org/stable/users/explain/colors/colors.html#colors-def
    # https://matplotlib.org/stable/users/explain/colors/colormaps.html
    ax.plot(
        df["x_tick"],
        df["nb_days_avg"],
        color="b",
        marker="o",
        label="Number of days between opening and close",
    )
    ax.plot(
        df["x_tick"],
        df["closed_nb"],
        color="g",
        marker="o",
        label="Issues closed by someone else than the opener",
    )

    ax.set_title(
        "Statistics on PyPI issues, by month",
        pad=10,
    )
    ax.legend()

    # Rotate the month labels and anchor them on their right end so that
    # they stay readable and aligned with their tick.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    plt.show()
-c requirements.txt
ipython
pytest
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile requirements-dev.in
#
asttokens==2.4.1
# via stack-data
decorator==5.1.1
# via ipython
exceptiongroup==1.2.1
# via
# ipython
# pytest
executing==2.0.1
# via stack-data
iniconfig==2.0.0
# via pytest
ipython==8.25.0
# via -r requirements-dev.in
jedi==0.19.1
# via ipython
matplotlib-inline==0.1.7
# via ipython
packaging==24.1
# via
# -c requirements.txt
# pytest
parso==0.8.4
# via jedi
pexpect==4.9.0
# via ipython
pluggy==1.5.0
# via pytest
prompt-toolkit==3.0.47
# via ipython
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
pygments==2.18.0
# via ipython
pytest==8.2.2
# via -r requirements-dev.in
six==1.16.0
# via
# -c requirements.txt
# asttokens
stack-data==0.6.3
# via ipython
tomli==2.0.1
# via pytest
traitlets==5.14.3
# via
# ipython
# matplotlib-inline
typing-extensions==4.12.2
# via ipython
wcwidth==0.2.13
# via prompt-toolkit
duckdb
matplotlib
pandas
python-dotenv
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile requirements.in
#
contourpy==1.2.1
# via matplotlib
cycler==0.12.1
# via matplotlib
duckdb==1.0.0
# via -r requirements.in
fonttools==4.53.0
# via matplotlib
kiwisolver==1.4.5
# via matplotlib
matplotlib==3.9.0
# via -r requirements.in
numpy==1.26.4
# via
# contourpy
# matplotlib
# pandas
packaging==24.1
# via matplotlib
pandas==2.2.2
# via -r requirements.in
pillow==10.3.0
# via matplotlib
pyparsing==3.1.2
# via matplotlib
python-dateutil==2.9.0.post0
# via
# matplotlib
# pandas
python-dotenv==1.0.1
# via -r requirements.in
pytz==2024.1
# via pandas
six==1.16.0
# via python-dateutil
tzdata==2024.1
# via pandas
-- Count the "PEP 541" issues closed by someone other than the person who
-- opened them, grouped by closing year and month.
-- Output columns: cl_year, cl_month, nb_days_avg (average number of days
-- between creation and closing, rounded to an integer) and closed_nb.
-- NOTE(review): no predicate below restricts the result to a "PEP 541"
-- label; pypi_issues_labels is only joined, never filtered. Confirm whether
-- that table already contains nothing but PEP 541 labels.
-- NOTE(review): count(*) and avg() are computed over the joined rows. An
-- issue with several events from other actors, or several labels, would
-- presumably be counted more than once - verify against the table contents.
SELECT
year(i.closed_at) AS cl_year,
month(i.closed_at) AS cl_month,
cast(round(avg(datediff('day', i.created_at, i.closed_at)), 0) AS BIGINT) AS nb_days_avg,
count(*) AS closed_nb
FROM
pypi_issues AS i
INNER JOIN pypi_issues_events AS e
ON i.number = e.issue_number
INNER JOIN pypi_issues_labels AS l
ON l.issue_id = i.id
WHERE
i.state = 'closed'
-- Keep only the event rows whose actor is not the issue's opener.
AND i.user_login != e.actor_login
GROUP BY
cl_year, cl_month
ORDER BY
cl_year, cl_month;
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment