From 95a0825d974abf5cb909ecdb3777592885dd8d7b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7oise=20Conil?= <francoise.conil@insa-lyon.fr>
Date: Thu, 4 Jul 2024 12:13:32 +0200
Subject: [PATCH] Import PyPI issues and associated events in a DuckDB database

---
 .../filtering-closed-events.sh                |  13 ++
 .../get-all-pypi-issue-events_gh.sh           |  15 ++
 .../create-issue-db/get-all-pypi-issues_gh.sh |  11 ++
 .../create-issue-db/import-issues.sql         | 107 +++++++++++
 .../create-issue-db/import-with-gh-duckdb.md  | 167 ++++++++++++++++++
 .../create-issue-db/insert-closed-events.sh   |  67 +++++++
 6 files changed, 380 insertions(+)
 create mode 100755 issues-with-duckdb_gh/create-issue-db/filtering-closed-events.sh
 create mode 100755 issues-with-duckdb_gh/create-issue-db/get-all-pypi-issue-events_gh.sh
 create mode 100755 issues-with-duckdb_gh/create-issue-db/get-all-pypi-issues_gh.sh
 create mode 100644 issues-with-duckdb_gh/create-issue-db/import-issues.sql
 create mode 100644 issues-with-duckdb_gh/create-issue-db/import-with-gh-duckdb.md
 create mode 100755 issues-with-duckdb_gh/create-issue-db/insert-closed-events.sh

diff --git a/issues-with-duckdb_gh/create-issue-db/filtering-closed-events.sh b/issues-with-duckdb_gh/create-issue-db/filtering-closed-events.sh
new file mode 100755
index 0000000..a662571
--- /dev/null
+++ b/issues-with-duckdb_gh/create-issue-db/filtering-closed-events.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# ls -a events | while read A
+# do
+#     echo "Filtering closed event in events/$A"
+#     jq '.[] | select(.event == "closed")' "events/$A" > "events_closed/$A"
+# done
+
+duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb
+do
+    echo "Filtering closed event in events/pypi_issue_${inb}.json"
+    jq '.[] | select(.event == "closed")' "events/pypi_issue_${inb}.json" > "events_closed/pypi_issue_${inb}_closed.json"
+done
diff --git a/issues-with-duckdb_gh/create-issue-db/get-all-pypi-issue-events_gh.sh b/issues-with-duckdb_gh/create-issue-db/get-all-pypi-issue-events_gh.sh
new file mode 100755
index 0000000..3d892e1
--- /dev/null
+++ b/issues-with-duckdb_gh/create-issue-db/get-all-pypi-issue-events_gh.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+# select number from pypi_issues where state='closed' limit 5;
+# gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/repos/pypi/support/issues/4012/events" --paginate >pypi_issue_4012_events_gh.json
+
+duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb
+do
+    echo "Get $inb issues events";
+    gh api \
+        -H "Accept: application/vnd.github+json" \
+        -H "X-GitHub-Api-Version: 2022-11-28" \
+        "/repos/pypi/support/issues/${inb}/events" \
+        --paginate >"events/pypi_issue_${inb}.json"
+    sleep 0.2;
+done
diff --git a/issues-with-duckdb_gh/create-issue-db/get-all-pypi-issues_gh.sh b/issues-with-duckdb_gh/create-issue-db/get-all-pypi-issues_gh.sh
new file mode 100755
index 0000000..d3609de
--- /dev/null
+++ b/issues-with-duckdb_gh/create-issue-db/get-all-pypi-issues_gh.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# https://docs.github.com/en/rest/quickstart?apiVersion=2022-11-28
+# https://docs.github.com/en/rest/issues/events?apiVersion=2022-11-28#list-issue-events
+#
+# https://cli.github.com/manual/gh_api
+# --paginate 
+#   Make additional HTTP requests to fetch all pages of results
+
+$ gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/repos/pypi/support/issues?state=all" --paginate >pypi_issues_gh.json
+
diff --git a/issues-with-duckdb_gh/create-issue-db/import-issues.sql b/issues-with-duckdb_gh/create-issue-db/import-issues.sql
new file mode 100644
index 0000000..b4bdec3
--- /dev/null
+++ b/issues-with-duckdb_gh/create-issue-db/import-issues.sql
@@ -0,0 +1,107 @@
+-- Define the dialect
+-- sqlfluff:dialect:duckdb
+
+-- Set a smaller indent for this file
+-- sqlfluff:indentation:tab_space_size:2
+
+-- Set keywords to be capitalised
+-- sqlfluff:rules:capitalisation.keywords:capitalisation_policy:upper
+
+-- https://duckdb.org/docs/data/json/overview
+-- https://duckdb.org/docs/extensions/json
+-- https://duckdb.org/2023/03/03/json
+
+DROP TABLE IF EXISTS pypi_issues;
+
+CREATE TABLE pypi_issues (
+  id BIGINT PRIMARY KEY,
+  number BIGINT UNIQUE,
+  node_id VARCHAR,
+  state VARCHAR,
+  user_id BIGINT,
+  user_login VARCHAR,
+  user_node_id VARCHAR,
+  labels JSON,
+  created_at TIMESTAMP,
+  closed_at TIMESTAMP
+);
+
+DROP TABLE IF EXISTS pypi_issues_labels;
+
+CREATE TABLE pypi_issues_labels (
+  issue_id BIGINT,
+  label_id BIGINT,
+  label_node_id VARCHAR,
+  label_name VARCHAR,
+  FOREIGN KEY (issue_id) REFERENCES pypi_issues (id)
+);
+
+DROP TABLE IF EXISTS pypi_issues_events;
+
+CREATE TABLE pypi_issues_events (
+  issue_number BIGINT,
+  event_id BIGINT,
+  node_id VARCHAR,
+  actor_id BIGINT,
+  actor_login VARCHAR,
+  actor_node_id VARCHAR,
+  actor_type VARCHAR,
+  actor_site_admin VARCHAR,
+  event VARCHAR,
+  created_at TIMESTAMP,
+  state_reason VARCHAR,
+  FOREIGN KEY (issue_number) REFERENCES pypi_issues (number)
+);
+
+INSERT INTO pypi_issues
+(
+  SELECT
+    id,
+    number,
+    node_id,
+    state,
+    user.id,
+    user.login,
+    user.node_id,
+    labels,
+    created_at,
+    closed_at
+  FROM read_json(
+    'pypi_issues_gh.json',
+    format = 'array',
+    columns = {
+      'id': 'BIGINT',
+      'number': 'BIGINT',
+      'node_id': 'VARCHAR',
+      'state': 'VARCHAR',
+      'user': 'STRUCT(login VARCHAR, id BIGINT, node_id VARCHAR)',
+      'labels': 'STRUCT(
+                   id BIGINT,
+                   node_id VARCHAR,
+                   url VARCHAR,
+                   "name" VARCHAR,
+                   color VARCHAR,
+                   "default" BOOLEAN,
+                   description VARCHAR
+                   )[]',
+      'created_at': 'TIMESTAMP',
+      'closed_at': 'TIMESTAMP'
+    }
+  )
+);
+
+WITH q AS (
+  SELECT
+    id,
+    unnest(labels) AS label
+  FROM 'pypi_issues_gh.json'
+)
+
+INSERT INTO pypi_issues_labels (
+  SELECT
+    id as issue_id,
+    label -> 'id' AS label_id,
+    label -> 'node_id' AS label_node_id,
+    label -> 'name' AS label_name
+  FROM q)
+;
diff --git a/issues-with-duckdb_gh/create-issue-db/import-with-gh-duckdb.md b/issues-with-duckdb_gh/create-issue-db/import-with-gh-duckdb.md
new file mode 100644
index 0000000..e4bc666
--- /dev/null
+++ b/issues-with-duckdb_gh/create-issue-db/import-with-gh-duckdb.md
@@ -0,0 +1,167 @@
+# Import PyPI issues with gh and DuckDB
+
+**La base DuckDB créée est : pypi_issues.db**.
+
+Les issues importées avec `curl` ont des **doublons**, pourquoi ? Je n'ai pas
+le temps de chercher.
+
+`gh` (GitHub CLI) a une option `--paginate` qui permet de requêter l'API et
+récupérer toutes les pages.
+
+- [Quickstart for GitHub REST API](https://docs.github.com/en/rest/quickstart?apiVersion=2022-11-28)
+- [List repository issues](https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues)
+
+> [gh api](https://cli.github.com/manual/gh_api)
+>
+> `--paginate`
+>   Make additional HTTP requests to fetch all pages of results
+
+Script `get-all-pypi-issues_gh.sh` :
+
+```shell
+#!/bin/bash
+
+$ gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/repos/pypi/support/issues?state=all" --paginate >pypi_issues_gh.json
+```
+
+La base de données est créé et remplie par `import-issues.sql`, voir `.read
+import-json.sql` ci-dessous. 3 tables sont créées :
+
+- `pypi_issues` : table principale
+- `pypi_issues_labels` : table des labels associés aux issues
+- `pypi_issues_events` : table des événements associés aux issues
+
+Avec le fichier json récupéré avec `gh`, on n'a pas d'issues dupliquées. On
+peut déclarer l'`id` en `PRIMARY KEY`.
+
+```
+$ duckdb pypi_issues.db
+v1.0.0 1f98600c2c
+Enter ".help" for usage hints.
+D .read import-json.sql
+D .tables
+pypi_issues
+D select count(id) from pypi_issues;
+┌───────────┐
+│ count(id) │
+│   int64   │
+├───────────┤
+│      4143 │
+└───────────┘
+D SELECT id, count(id) AS nb FROM pypi_issues GROUP BY id HAVING nb > 1;
+┌───────┬───────┐
+│  id   │  nb   │
+│ int64 │ int64 │
+├───────┴───────┤
+│    0 rows     │
+└───────────────┘
+D
+```
+
+## Récupérer les events
+
+Pas moyen de récupérer un event particulier, `closed`, pour une issue données
+via l'API GitHub.
+
+On récupère donc tous les events de toutes les issues et on va trier.
+
+Script `get-all-pypi-issue-events_gh.sh` :
+
+```shell
+#!/bin/bash
+
+duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb
+do
+    echo "Get $inb issues events";
+    gh api \
+        -H "Accept: application/vnd.github+json" \
+        -H "X-GitHub-Api-Version: 2022-11-28" \
+        "/repos/pypi/support/issues/${inb}/events" \
+        --paginate >"events/pypi_issue_${inb}.json"
+    sleep 0.2;
+done
+```
+
+Filtrage des événements avec `jq`.
+
+```shell
+< pypi_issue_3835.json jq '.[] | select(.event == "closed")'
+```
+
+Utilisé par le script `filtering-closed-events.sh`.
+
+```shell
+#!/bin/bash
+
+duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb
+do
+    echo "Filtering closed event in events/pypi_issue_${inb}.json"
+    jq '.[] | select(.event == "closed")' "events/pypi_issue_${inb}.json" > "events_closed/pypi_issue_${inb}_closed.json"
+done
+```
+
+Le script `insert-closed-events.sh` insère ensuite les événements `closed` dans
+la BD. *On doit pouvoir combiner et faire beaucoup mieux*.
+
+```shell
+#!/bin/bash
+
+duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb
+do
+    echo "Filtering closed event in events/pypi_issue_${inb}.json"
+    #         format = 'array', \
+    query="INSERT INTO pypi_issues_events ( \
+            SELECT \
+              ${inb}, \
+              id, \
+              node_id, \
+              actor.id, \
+              actor.login, \
+              actor.node_id, \
+              actor.type, \
+              actor.site_admin, \
+              event, \
+              created_at, \
+              state_reason \
+            FROM read_json( \
+              'events_closed/pypi_issue_${inb}_closed.json', \
+              columns = { \
+                'id': 'BIGINT', \
+                'node_id': 'VARCHAR', \
+                'actor': 'STRUCT(login VARCHAR, id BIGINT, node_id VARCHAR, type VARCHAR, site_admin VARCHAR)', \
+                'state': 'VARCHAR', \
+                'event': 'VARCHAR', \
+                'created_at': 'TIMESTAMP', \
+                'state_reason': 'VARCHAR' \
+              } \
+            ) \
+          );"
+
+    # echo $query
+    duckdb pypi_issues.db -noheader -csv -c "${query}"
+done
+```
+
+Cela ne semble pas cohérent entre le nombre d'événements `closed` et le nombre
+d'issues `closed`.
+
+```
+$ duckdb pypi_issues.db
+v1.0.0 1f98600c2c
+Enter ".help" for usage hints.
+D select count(event_id) from pypi_issues_events;
+┌─────────────────┐
+│ count(event_id) │
+│      int64      │
+├─────────────────┤
+│            3365 │
+└─────────────────┘
+D select state, count(id) as nb from pypi_issues group by state;
+┌─────────┬───────┐
+│  state  │  nb   │
+│ varchar │ int64 │
+├─────────┼───────┤
+│ closed  │  3320 │
+│ open    │   823 │
+└─────────┴───────┘
+```
diff --git a/issues-with-duckdb_gh/create-issue-db/insert-closed-events.sh b/issues-with-duckdb_gh/create-issue-db/insert-closed-events.sh
new file mode 100755
index 0000000..1e78dbc
--- /dev/null
+++ b/issues-with-duckdb_gh/create-issue-db/insert-closed-events.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+# ls -a events | while read A
+# do
+#     echo "Filtering closed event in events/$A"
+#     jq '.[] | select(.event == "closed")' "events/$A" > "events_closed/$A"
+# done
+
+# query="INSERT INTO pypi_issues_events (
+# > SELECT ${inb}, id, node_id, state, user.id,
+# >        user.login, user.node_id, user.type, 
+# >        user.site_admin, event, created_at, state_reason 
+# > FROM read_json('events_closed/pypi_issue_${inb}_closed.json',
+# >                format = 'array',
+# >                columns = {'id': 'BIGINT',
+# >                           'number': 'BIGINT',
+# >                           'node_id': 'VARCHAR',
+# >                           'state': 'VARCHAR',
+# >                           'user': 'STRUCT(login VARCHAR, id BIGINT, node_id VARCHAR)',
+# >                           'labels': 'STRUCT(
+# >                                        id BIGINT,
+# >                                        node_id VARCHAR,
+# >                                        url VARCHAR,
+# >                                        'name' VARCHAR,
+# >                                        color VARCHAR,
+# >                                        'default' BOOLEAN,
+# >                                        description VARCHAR
+# >                                        )[]',
+# >                           'created_at': 'TIMESTAMP',
+# >                           'closed_at': 'TIMESTAMP'
+# >                         })
+# > );"
+
+duckdb pypi_issues.db -noheader -csv -c "SELECT number FROM pypi_issues WHERE state='closed'" | while read inb
+do
+    echo "Filtering closed event in events/pypi_issue_${inb}.json"
+    #         format = 'array', \
+    query="INSERT INTO pypi_issues_events ( \
+            SELECT \
+              ${inb}, \
+              id, \
+              node_id, \
+              actor.id, \
+              actor.login, \
+              actor.node_id, \
+              actor.type, \
+              actor.site_admin, \
+              event, \
+              created_at, \
+              state_reason \
+            FROM read_json( \
+              'events_closed/pypi_issue_${inb}_closed.json', \
+              columns = { \
+                'id': 'BIGINT', \
+                'node_id': 'VARCHAR', \
+                'actor': 'STRUCT(login VARCHAR, id BIGINT, node_id VARCHAR, type VARCHAR, site_admin VARCHAR)', \
+                'state': 'VARCHAR', \
+                'event': 'VARCHAR', \
+                'created_at': 'TIMESTAMP', \
+                'state_reason': 'VARCHAR' \
+              } \
+            ) \
+          );"
+
+    # echo $query
+    duckdb pypi_issues.db -noheader -csv -c "${query}"
+done
-- 
GitLab