From 95a0825d974abf5cb909ecdb3777592885dd8d7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7oise=20Conil?= <francoise.conil@insa-lyon.fr> Date: Thu, 4 Jul 2024 12:13:32 +0200 Subject: [PATCH] Import PyPI issues and associated events in a DuckDB database --- .../filtering-closed-events.sh | 13 ++ .../get-all-pypi-issue-events_gh.sh | 15 ++ .../create-issue-db/get-all-pypi-issues_gh.sh | 11 ++ .../create-issue-db/import-issues.sql | 107 +++++++++++ .../create-issue-db/import-with-gh-duckdb.md | 167 ++++++++++++++++++ .../create-issue-db/insert-closed-events.sh | 67 +++++++ 6 files changed, 380 insertions(+) create mode 100755 issues-with-duckdb_gh/create-issue-db/filtering-closed-events.sh create mode 100755 issues-with-duckdb_gh/create-issue-db/get-all-pypi-issue-events_gh.sh create mode 100755 issues-with-duckdb_gh/create-issue-db/get-all-pypi-issues_gh.sh create mode 100644 issues-with-duckdb_gh/create-issue-db/import-issues.sql create mode 100644 issues-with-duckdb_gh/create-issue-db/import-with-gh-duckdb.md create mode 100755 issues-with-duckdb_gh/create-issue-db/insert-closed-events.sh diff --git a/issues-with-duckdb_gh/create-issue-db/filtering-closed-events.sh b/issues-with-duckdb_gh/create-issue-db/filtering-closed-events.sh new file mode 100755 index 0000000..a662571 --- /dev/null +++ b/issues-with-duckdb_gh/create-issue-db/filtering-closed-events.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# ls -a events | while read A +# do +# echo "Filtering closed event in events/$A" +# jq '.[] | select(.event == "closed")' "events/$A" > "events_closed/$A" +# done + +duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb +do + echo "Filtering closed event in events/pypi_issue_${inb}.json" + jq '.[] | select(.event == "closed")' "events/pypi_issue_${inb}.json" > "events_closed/pypi_issue_${inb}_closed.json" +done diff --git a/issues-with-duckdb_gh/create-issue-db/get-all-pypi-issue-events_gh.sh b/issues-with-duckdb_gh/create-issue-db/get-all-pypi-issue-events_gh.sh new file mode 100755 index 0000000..3d892e1 --- /dev/null +++ b/issues-with-duckdb_gh/create-issue-db/get-all-pypi-issue-events_gh.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# select number from pypi_issues where state='closed' limit 5; +# gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/repos/pypi/support/issues/4012/events" --paginate >pypi_issue_4012_events_gh.json + +duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb +do + echo "Get $inb issues events"; + gh api \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/pypi/support/issues/${inb}/events" \ + --paginate >"events/pypi_issue_${inb}.json" + sleep 0.2; +done diff --git a/issues-with-duckdb_gh/create-issue-db/get-all-pypi-issues_gh.sh b/issues-with-duckdb_gh/create-issue-db/get-all-pypi-issues_gh.sh new file mode 100755 index 0000000..d3609de --- /dev/null +++ b/issues-with-duckdb_gh/create-issue-db/get-all-pypi-issues_gh.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# https://docs.github.com/en/rest/quickstart?apiVersion=2022-11-28 +# https://docs.github.com/en/rest/issues/events?apiVersion=2022-11-28#list-issue-events +# +# https://cli.github.com/manual/gh_api +# --paginate +# Make additional HTTP requests to fetch all pages of results + +$ gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/repos/pypi/support/issues?state=all" --paginate >pypi_issues_gh.json + diff --git a/issues-with-duckdb_gh/create-issue-db/import-issues.sql b/issues-with-duckdb_gh/create-issue-db/import-issues.sql new file mode 100644 index 0000000..b4bdec3 --- /dev/null +++ b/issues-with-duckdb_gh/create-issue-db/import-issues.sql @@ -0,0 +1,107 @@ +-- Define the dialect +-- sqlfluff:dialect:duckdb + +-- Set a smaller indent for this file +-- sqlfluff:indentation:tab_space_size:2 + +-- Set keywords to be capitalised +-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper + +-- https://duckdb.org/docs/data/json/overview +-- https://duckdb.org/docs/extensions/json +-- https://duckdb.org/2023/03/03/json + +DROP TABLE IF EXISTS pypi_issues; + +CREATE TABLE pypi_issues ( + id BIGINT PRIMARY KEY, + number BIGINT UNIQUE, + node_id VARCHAR, + state VARCHAR, + user_id BIGINT, + user_login VARCHAR, + user_node_id VARCHAR, + labels JSON, + created_at TIMESTAMP, + closed_at TIMESTAMP +); + +DROP TABLE IF EXISTS pypi_issues_labels; + +CREATE TABLE pypi_issues_labels ( + issue_id BIGINT, + label_id BIGINT, + label_node_id VARCHAR, + label_name VARCHAR, + FOREIGN KEY (issue_id) REFERENCES pypi_issues (id) +); + +DROP TABLE IF EXISTS pypi_issues_events; + +CREATE TABLE pypi_issues_events ( + issue_number BIGINT, + event_id BIGINT, + node_id VARCHAR, + actor_id BIGINT, + actor_login VARCHAR, + actor_node_id VARCHAR, + actor_type VARCHAR, + actor_site_admin VARCHAR, + event VARCHAR, + created_at TIMESTAMP, + state_reason VARCHAR, + FOREIGN KEY (issue_number) REFERENCES pypi_issues (number) +); + +INSERT INTO pypi_issues +( + SELECT + id, + number, + node_id, + state, + user.id, + user.login, + user.node_id, + labels, + created_at, + closed_at + FROM read_json( + 'pypi_issues_gh.json', + format = 'array', + columns = { + 'id': 'BIGINT', + 'number': 'BIGINT', + 'node_id': 'VARCHAR', + 'state': 'VARCHAR', + 'user': 'STRUCT(login VARCHAR, id BIGINT, node_id VARCHAR)', + 'labels': 'STRUCT( + id BIGINT, + node_id VARCHAR, + url VARCHAR, + "name" VARCHAR, + color VARCHAR, + "default" BOOLEAN, + description VARCHAR + )[]', + 'created_at': 'TIMESTAMP', + 'closed_at': 'TIMESTAMP' + } + ) +); + +WITH q AS ( + SELECT + id, + unnest(labels) AS label + FROM 'pypi_issues_gh.json' +) + +INSERT INTO pypi_issues_labels ( + SELECT + id as issue_id, + label -> 'id' AS label_id, + label -> 'node_id' AS label_node_id, + label -> 'name' AS label_name + FROM q) +; diff --git a/issues-with-duckdb_gh/create-issue-db/import-with-gh-duckdb.md b/issues-with-duckdb_gh/create-issue-db/import-with-gh-duckdb.md new file mode 100644 index 0000000..e4bc666 --- /dev/null +++ b/issues-with-duckdb_gh/create-issue-db/import-with-gh-duckdb.md @@ -0,0 +1,167 @@ +# Import PyPI issues with gh and DuckDB + +**La base DuckDB créée est : pypi_issues.db**. + +Les issues importées avec `curl` ont des **doublons**, pourquoi ? Je n'ai pas +le temps de chercher. + +`gh` (GitHub CLI) a une option `--paginate` qui permet de requêter l'API et +récupérer toutes les pages. + +- [Quickstart for GitHub REST API](https://docs.github.com/en/rest/quickstart?apiVersion=2022-11-28) +- [List repository issues](https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues) + +> [gh api](https://cli.github.com/manual/gh_api) +> +> `--paginate` +> Make additional HTTP requests to fetch all pages of results + +Script `get-all-pypi-issues_gh.sh` : + +```shell +#!/bin/bash + +$ gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/repos/pypi/support/issues?state=all" --paginate >pypi_issues_gh.json +``` + +La base de données est créé et remplie par `import-issues.sql`, voir `.read +import-json.sql` ci-dessous. 3 tables sont créées : + +- `pypi_issues` : table principale +- `pypi_issues_labels` : table des labels associés aux issues +- `pypi_issues_events` : table des événements associés aux issues + +Avec le fichier json récupéré avec `gh`, on n'a pas d'issues dupliquées. On +peut déclarer l'`id` en `PRIMARY KEY`. + +``` +$ duckdb pypi_issues.db +v1.0.0 1f98600c2c +Enter ".help" for usage hints. +D .read import-json.sql +D .tables +pypi_issues +D select count(id) from pypi_issues; +┌───────────┠+│ count(id) │ +│ int64 │ +├───────────┤ +│ 4143 │ +└───────────┘ +D SELECT id, count(id) AS nb FROM pypi_issues GROUP BY id HAVING nb > 1; +┌───────┬───────┠+│ id │ nb │ +│ int64 │ int64 │ +├───────┴───────┤ +│ 0 rows │ +└───────────────┘ +D +``` + +## Récupérer les events + +Pas moyen de récupérer un event particulier, `closed`, pour une issue données +via l'API GitHub. + +On récupère donc tous les events de toutes les issues et on va trier. + +Script `get-all-pypi-issue-events_gh.sh` : + +```shell +#!/bin/bash + +duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb +do + echo "Get $inb issues events"; + gh api \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/pypi/support/issues/${inb}/events" \ + --paginate >"events/pypi_issue_${inb}.json" + sleep 0.2; +done +``` + +Filtrage des événements avec `jq`. + +```shell +< pypi_issue_3835.json jq '.[] | select(.event == "closed")' +``` + +Utilisé par le script `filtering-closed-events.sh`. + +```shell +#!/bin/bash + +duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb +do + echo "Filtering closed event in events/pypi_issue_${inb}.json" + jq '.[] | select(.event == "closed")' "events/pypi_issue_${inb}.json" > "events_closed/pypi_issue_${inb}_closed.json" +done +``` + +Le script `insert-closed-events.sh` insère ensuite les événements `closed` dans +la BD. *On doit pouvoir combiner et faire beaucoup mieux*. + +```shell +#!/bin/bash + +duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb +do + echo "Filtering closed event in events/pypi_issue_${inb}.json" + # format = 'array', \ + query="INSERT INTO pypi_issues_events ( \ + SELECT \ + ${inb}, \ + id, \ + node_id, \ + actor.id, \ + actor.login, \ + actor.node_id, \ + actor.type, \ + actor.site_admin, \ + event, \ + created_at, \ + state_reason \ + FROM read_json( \ + 'events_closed/pypi_issue_${inb}_closed.json', \ + columns = { \ + 'id': 'BIGINT', \ + 'node_id': 'VARCHAR', \ + 'actor': 'STRUCT(login VARCHAR, id BIGINT, node_id VARCHAR, type VARCHAR, site_admin VARCHAR)', \ + 'state': 'VARCHAR', \ + 'event': 'VARCHAR', \ + 'created_at': 'TIMESTAMP', \ + 'state_reason': 'VARCHAR' \ + } \ + ) \ + );" + + # echo $query + duckdb pypi_issues.db -noheader -csv -c "${query}" +done +``` + +Cela ne semble pas cohérent entre le nombre d'événements `closed` et le nombre +d'issues `closed`. + +``` +$ duckdb pypi_issues.db +v1.0.0 1f98600c2c +Enter ".help" for usage hints. +D select count(event_id) from pypi_issues_events; +┌─────────────────┠+│ count(event_id) │ +│ int64 │ +├─────────────────┤ +│ 3365 │ +└─────────────────┘ +D select state, count(id) as nb from pypi_issues group by state; +┌─────────┬───────┠+│ state │ nb │ +│ varchar │ int64 │ +├─────────┼───────┤ +│ closed │ 3320 │ +│ open │ 823 │ +└─────────┴───────┘ +``` diff --git a/issues-with-duckdb_gh/create-issue-db/insert-closed-events.sh b/issues-with-duckdb_gh/create-issue-db/insert-closed-events.sh new file mode 100755 index 0000000..1e78dbc --- /dev/null +++ b/issues-with-duckdb_gh/create-issue-db/insert-closed-events.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# ls -a events | while read A +# do +# echo "Filtering closed event in events/$A" +# jq '.[] | select(.event == "closed")' "events/$A" > "events_closed/$A" +# done + +# query="INSERT INTO pypi_issues_events ( +# > SELECT ${inb}, id, node_id, state, user.id, +# > user.login, user.node_id, user.type, +# > user.site_admin, event, created_at, state_reason +# > FROM read_json('events_closed/pypi_issue_${inb}_closed.json', +# > format = 'array', +# > columns = {'id': 'BIGINT', +# > 'number': 'BIGINT', +# > 'node_id': 'VARCHAR', +# > 'state': 'VARCHAR', +# > 'user': 'STRUCT(login VARCHAR, id BIGINT, node_id VARCHAR)', +# > 'labels': 'STRUCT( +# > id BIGINT, +# > node_id VARCHAR, +# > url VARCHAR, +# > 'name' VARCHAR, +# > color VARCHAR, +# > 'default' BOOLEAN, +# > description VARCHAR +# > )[]', +# > 'created_at': 'TIMESTAMP', +# > 'closed_at': 'TIMESTAMP' +# > }) +# > );" + +duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb +do + echo "Filtering closed event in events/pypi_issue_${inb}.json" + # format = 'array', \ + query="INSERT INTO pypi_issues_events ( \ + SELECT \ + ${inb}, \ + id, \ + node_id, \ + actor.id, \ + actor.login, \ + actor.node_id, \ + actor.type, \ + actor.site_admin, \ + event, \ + created_at, \ + state_reason \ + FROM read_json( \ + 'events_closed/pypi_issue_${inb}_closed.json', \ + columns = { \ + 'id': 'BIGINT', \ + 'node_id': 'VARCHAR', \ + 'actor': 'STRUCT(login VARCHAR, id BIGINT, node_id VARCHAR, type VARCHAR, site_admin VARCHAR)', \ + 'state': 'VARCHAR', \ + 'event': 'VARCHAR', \ + 'created_at': 'TIMESTAMP', \ + 'state_reason': 'VARCHAR' \ + } \ + ) \ + );" + + # echo $query + duckdb pypi_issues.db -noheader -csv -c "${query}" +done -- GitLab