From c20d86f9886b467a5b3b038364573e5b134a6e91 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Wed, 20 Mar 2024 09:46:07 +0100
Subject: [PATCH] Add the code used to annotate ENE and export labels while
 we're at it

---
 GEODE/ENE/Detector.py | 25 +++++++++++++++++++++++++
 GEODE/ENE/__init__.py | 14 ++++++++++++++
 GEODE/ENE/__main__.py |  6 ++++++
 guix.scm              |  5 +++--
 requirements.txt      |  3 ++-
 5 files changed, 50 insertions(+), 3 deletions(-)
 create mode 100644 GEODE/ENE/Detector.py
 create mode 100644 GEODE/ENE/__init__.py
 create mode 100644 GEODE/ENE/__main__.py

diff --git a/GEODE/ENE/Detector.py b/GEODE/ENE/Detector.py
new file mode 100644
index 0000000..5e31d67
--- /dev/null
+++ b/GEODE/ENE/Detector.py
@@ -0,0 +1,25 @@
+from GEODE.Metadata import articleKey, fromKey, toKey
+import pandas
+import sys
+
+columns = articleKey + ['event', 'position', 'size', 'totalSize']  # header and column order of the TSV output
+
+def eventsOfAnnotation(start, common, annotated):
+    """Write one headerless TSV row to stdout per span in annotated.spans['sc']."""
+    # Span offsets are relative to `annotated`; `start` is added to each position.
+    events = [{**common, 'event': s.label_, 'position': start + s.start_char,
+               'size': s.end_char - s.start_char}
+              for s in annotated.spans['sc']]
+    if events:  # nothing is written when no span was found
+        pandas.DataFrame(events)[columns].to_csv(sys.stdout, sep='\t', index=False, header=False)
+
+def detector(model, corpus):
+    """Print the TSV header, then the events `model` finds in every text of `corpus`."""
+    print(*columns, sep='\t')
+    for text in corpus.get_all():
+        paragraphs = text['content'].split('\n\n')
+        offset = 0  # counts paragraph characters only, not the '\n\n' separators
+        common = {**fromKey(toKey(text)), 'totalSize': sum(map(len, paragraphs))}
+        for paragraph in paragraphs:
+            eventsOfAnnotation(offset, common, model(paragraph))
+            offset += len(paragraph)
diff --git a/GEODE/ENE/__init__.py b/GEODE/ENE/__init__.py
new file mode 100644
index 0000000..e05893a
--- /dev/null
+++ b/GEODE/ENE/__init__.py
@@ -0,0 +1,14 @@
+ene_labels = [  # label names, importable from the package root
+    "Domain-mark",
+    "Head",
+    "Latlong",
+    "NC-Spatial",
+    "NC-Person",
+    "NP-Misc",
+    "NP-Person",
+    "NP-Spatial",
+    "Relation",
+    "ENE-Spatial",
+    "ENE-Person",
+    "ENE-Misc",
+]
diff --git a/GEODE/ENE/__main__.py b/GEODE/ENE/__main__.py
new file mode 100644
index 0000000..1b39e34
--- /dev/null
+++ b/GEODE/ENE/__main__.py
@@ -0,0 +1,6 @@
+from GEODE.ENE.Detector import detector
+from GEODE.Store import corpus
+import spacy
+import sys
+
+detector(spacy.load(sys.argv[1]), corpus(sys.argv[2]))  # argv[1] goes to spacy.load, argv[2] to GEODE.Store.corpus
diff --git a/guix.scm b/guix.scm
index fb6e3db..ec463d1 100644
--- a/guix.scm
+++ b/guix.scm
@@ -1,4 +1,4 @@
-(use-modules ((gnu packages machine-learning) #:select (python-pytorch python-scikit-learn))
+(use-modules ((gnu packages machine-learning) #:select (python-pytorch python-scikit-learn python-spacy))
              ((gnu packages python-science) #:select (python-pandas))
              ((gnu packages python-xyz) #:select (python-matplotlib
                                                   python-nltk
@@ -23,7 +23,8 @@
       (list python-matplotlib
             python-pandas
             python-scikit-learn
-            python-seaborn))
+            python-seaborn
+            python-spacy))
     (arguments
      (list #:tests? #f))
     (home-page "https://gitlab.liris.cnrs.fr/geode/geopyck")
diff --git a/requirements.txt b/requirements.txt
index 715fabc..b818735 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
+matplotlib
 pandas 
 scikit-learn
-matplotlib
 seaborn
+spacy
-- 
GitLab