Skip to content
Snippets Groups Projects
Commit 95a0825d authored by Françoise Conil's avatar Françoise Conil
Browse files

Import PyPI issues and associated events in a DuckDB database

parent df4141ec
No related branches found
No related tags found
No related merge requests found
#!/bin/bash
# ls -a events | while read A
# do
# echo "Filtering closed event in events/$A"
# jq '.[] | select(.event == "closed")' "events/$A" > "events_closed/$A"
# done
duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb
do
echo "Filtering closed event in events/pypi_issue_${inb}.json"
jq '.[] | select(.event == "closed")' "events/pypi_issue_${inb}.json" > "events_closed/pypi_issue_${inb}_closed.json"
done
#!/bin/bash
# select number from pypi_issues where state='closed' limit 5;
# gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/repos/pypi/support/issues/4012/events" --paginate >pypi_issue_4012_events_gh.json
duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb
do
echo "Get $inb issues events";
gh api \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"/repos/pypi/support/issues/${inb}/events" \
--paginate >"events/pypi_issue_${inb}.json"
sleep 0.2;
done
#!/bin/bash
# https://docs.github.com/en/rest/quickstart?apiVersion=2022-11-28
# https://docs.github.com/en/rest/issues/events?apiVersion=2022-11-28#list-issue-events
#
# https://cli.github.com/manual/gh_api
# --paginate
# Make additional HTTP requests to fetch all pages of results
$ gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/repos/pypi/support/issues?state=all" --paginate >pypi_issues_gh.json
-- Define the dialect
-- sqlfluff:dialect:duckdb
-- Set a smaller indent for this file
-- sqlfluff:indentation:tab_space_size:2
-- Set keywords to be capitalised
-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper
-- https://duckdb.org/docs/data/json/overview
-- https://duckdb.org/docs/extensions/json
-- https://duckdb.org/2023/03/03/json
DROP TABLE IF EXISTS pypi_issues;
CREATE TABLE pypi_issues (
id BIGINT PRIMARY KEY,
number BIGINT UNIQUE,
node_id VARCHAR,
state VARCHAR,
user_id BIGINT,
user_login VARCHAR,
user_node_id VARCHAR,
labels JSON,
created_at TIMESTAMP,
closed_at TIMESTAMP
);
DROP TABLE IF EXISTS pypi_issues_labels;
CREATE TABLE pypi_issues_labels (
issue_id BIGINT,
label_id BIGINT,
label_node_id VARCHAR,
label_name VARCHAR,
FOREIGN KEY (issue_id) REFERENCES pypi_issues (id)
);
DROP TABLE IF EXISTS pypi_issues_events;
CREATE TABLE pypi_issues_events (
issue_number BIGINT,
event_id BIGINT,
node_id VARCHAR,
actor_id BIGINT,
actor_login VARCHAR,
actor_node_id VARCHAR,
actor_type VARCHAR,
actor_site_admin VARCHAR,
event VARCHAR,
created_at TIMESTAMP,
state_reason VARCHAR,
FOREIGN KEY (issue_number) REFERENCES pypi_issues (number)
);
INSERT INTO pypi_issues
(
SELECT
id,
number,
node_id,
state,
user.id,
user.login,
user.node_id,
labels,
created_at,
closed_at
FROM read_json(
'pypi_issues_gh.json',
format = 'array',
columns = {
'id': 'BIGINT',
'number': 'BIGINT',
'node_id': 'VARCHAR',
'state': 'VARCHAR',
'user': 'STRUCT(login VARCHAR, id BIGINT, node_id VARCHAR)',
'labels': 'STRUCT(
id BIGINT,
node_id VARCHAR,
url VARCHAR,
"name" VARCHAR,
color VARCHAR,
"default" BOOLEAN,
description VARCHAR
)[]',
'created_at': 'TIMESTAMP',
'closed_at': 'TIMESTAMP'
}
)
);
WITH q AS (
SELECT
id,
unnest(labels) AS label
FROM 'pypi_issues_gh.json'
)
INSERT INTO pypi_issues_labels (
SELECT
id as issue_id,
label -> 'id' AS label_id,
label -> 'node_id' AS label_node_id,
label -> 'name' AS label_name
FROM q)
;
# Import PyPI issues with gh and DuckDB
**La base DuckDB créée est : pypi_issues.db**.
Les issues importées avec `curl` ont des **doublons**, pourquoi ? Je n'ai pas
le temps de chercher.
`gh` (GitHub CLI) a une option `--paginate` qui permet de requêter l'API et
récupérer toutes les pages.
- [Quickstart for GitHub REST API](https://docs.github.com/en/rest/quickstart?apiVersion=2022-11-28)
- [List repository issues](https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues)
> [gh api](https://cli.github.com/manual/gh_api)
>
> `--paginate`
> Make additional HTTP requests to fetch all pages of results
Script `get-all-pypi-issues_gh.sh` :
```shell
#!/bin/bash
$ gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/repos/pypi/support/issues?state=all" --paginate >pypi_issues_gh.json
```
La base de données est créé et remplie par `import-issues.sql`, voir `.read
import-json.sql` ci-dessous. 3 tables sont créées :
- `pypi_issues` : table principale
- `pypi_issues_labels` : table des labels associés aux issues
- `pypi_issues_events` : table des événements associés aux issues
Avec le fichier json récupéré avec `gh`, on n'a pas d'issues dupliquées. On
peut déclarer l'`id` en `PRIMARY KEY`.
```
$ duckdb pypi_issues.db
v1.0.0 1f98600c2c
Enter ".help" for usage hints.
D .read import-json.sql
D .tables
pypi_issues
D select count(id) from pypi_issues;
┌───────────┐
│ count(id) │
│ int64 │
├───────────┤
│ 4143 │
└───────────┘
D SELECT id, count(id) AS nb FROM pypi_issues GROUP BY id HAVING nb > 1;
┌───────┬───────┐
│ id │ nb │
│ int64 │ int64 │
├───────┴───────┤
│ 0 rows │
└───────────────┘
D
```
## Récupérer les events
Pas moyen de récupérer un event particulier, `closed`, pour une issue données
via l'API GitHub.
On récupère donc tous les events de toutes les issues et on va trier.
Script `get-all-pypi-issue-events_gh.sh` :
```shell
#!/bin/bash
duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb
do
echo "Get $inb issues events";
gh api \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"/repos/pypi/support/issues/${inb}/events" \
--paginate >"events/pypi_issue_${inb}.json"
sleep 0.2;
done
```
Filtrage des événements avec `jq`.
```shell
< pypi_issue_3835.json jq '.[] | select(.event == "closed")'
```
Utilisé par le script `filtering-closed-events.sh`.
```shell
#!/bin/bash
duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb
do
echo "Filtering closed event in events/pypi_issue_${inb}.json"
jq '.[] | select(.event == "closed")' "events/pypi_issue_${inb}.json" > "events_closed/pypi_issue_${inb}_closed.json"
done
```
Le script `insert-closed-events.sh` insère ensuite les événements `closed` dans
la BD. *On doit pouvoir combiner et faire beaucoup mieux*.
```shell
#!/bin/bash
duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb
do
echo "Filtering closed event in events/pypi_issue_${inb}.json"
# format = 'array', \
query="INSERT INTO pypi_issues_events ( \
SELECT \
${inb}, \
id, \
node_id, \
actor.id, \
actor.login, \
actor.node_id, \
actor.type, \
actor.site_admin, \
event, \
created_at, \
state_reason \
FROM read_json( \
'events_closed/pypi_issue_${inb}_closed.json', \
columns = { \
'id': 'BIGINT', \
'node_id': 'VARCHAR', \
'actor': 'STRUCT(login VARCHAR, id BIGINT, node_id VARCHAR, type VARCHAR, site_admin VARCHAR)', \
'state': 'VARCHAR', \
'event': 'VARCHAR', \
'created_at': 'TIMESTAMP', \
'state_reason': 'VARCHAR' \
} \
) \
);"
# echo $query
duckdb pypi_issues.db -noheader -csv -c "${query}"
done
```
Cela ne semble pas cohérent entre le nombre d'événements `closed` et le nombre
d'issues `closed`.
```
$ duckdb pypi_issues.db
v1.0.0 1f98600c2c
Enter ".help" for usage hints.
D select count(event_id) from pypi_issues_events;
┌─────────────────┐
│ count(event_id) │
│ int64 │
├─────────────────┤
│ 3365 │
└─────────────────┘
D select state, count(id) as nb from pypi_issues group by state;
┌─────────┬───────┐
│ state │ nb │
│ varchar │ int64 │
├─────────┼───────┤
│ closed │ 3320 │
│ open │ 823 │
└─────────┴───────┘
```
#!/bin/bash
# ls -a events | while read A
# do
# echo "Filtering closed event in events/$A"
# jq '.[] | select(.event == "closed")' "events/$A" > "events_closed/$A"
# done
# query="INSERT INTO pypi_issues_events (
# > SELECT ${inb}, id, node_id, state, user.id,
# > user.login, user.node_id, user.type,
# > user.site_admin, event, created_at, state_reason
# > FROM read_json('events_closed/pypi_issue_${inb}_closed.json',
# > format = 'array',
# > columns = {'id': 'BIGINT',
# > 'number': 'BIGINT',
# > 'node_id': 'VARCHAR',
# > 'state': 'VARCHAR',
# > 'user': 'STRUCT(login VARCHAR, id BIGINT, node_id VARCHAR)',
# > 'labels': 'STRUCT(
# > id BIGINT,
# > node_id VARCHAR,
# > url VARCHAR,
# > 'name' VARCHAR,
# > color VARCHAR,
# > 'default' BOOLEAN,
# > description VARCHAR
# > )[]',
# > 'created_at': 'TIMESTAMP',
# > 'closed_at': 'TIMESTAMP'
# > })
# > );"
duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb
do
echo "Filtering closed event in events/pypi_issue_${inb}.json"
# format = 'array', \
query="INSERT INTO pypi_issues_events ( \
SELECT \
${inb}, \
id, \
node_id, \
actor.id, \
actor.login, \
actor.node_id, \
actor.type, \
actor.site_admin, \
event, \
created_at, \
state_reason \
FROM read_json( \
'events_closed/pypi_issue_${inb}_closed.json', \
columns = { \
'id': 'BIGINT', \
'node_id': 'VARCHAR', \
'actor': 'STRUCT(login VARCHAR, id BIGINT, node_id VARCHAR, type VARCHAR, site_admin VARCHAR)', \
'state': 'VARCHAR', \
'event': 'VARCHAR', \
'created_at': 'TIMESTAMP', \
'state_reason': 'VARCHAR' \
} \
) \
);"
# echo $query
duckdb pypi_issues.db -noheader -csv -c "${query}"
done
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment