Add the GEODE.ENE package and declare spaCy as a dependency.

What the patch adds
  GEODE/ENE/Detector.py  detector(model, corpus) prints a tab-separated
                         header (articleKey columns, then event, position,
                         size, totalSize) and, for every text returned by
                         corpus.get_all(), splits text['content'] on blank
                         lines, runs model() on each paragraph and prints one
                         row per span found in annotated.spans['sc'].
  GEODE/ENE/__init__.py  ene_labels, the list of twelve label names.
  GEODE/ENE/__main__.py  entry point: loads the spaCy model named by argv[1]
                         and runs the detector on the corpus named by argv[2].
  guix.scm               imports python-spacy and adds it to the package list.
  requirements.txt       adds spacy and moves matplotlib to the top.

Review notes
  * 'position' advances by len(paragraph) only, so the two-character
    paragraph separators are not counted. This agrees with 'totalSize',
    which is also the sum of the paragraph lengths; both are offsets in the
    text with separators removed, not offsets in text['content'].
  * This copy of the patch was recovered from a version whose line breaks
    and leading whitespace had been collapsed. Indentation below is a
    conventional reconstruction, and the blob ids on the index lines are
    copied verbatim. If context lines do not match the target tree, apply
    with whitespace-insensitive matching (git apply --ignore-whitespace, or
    patch -l).

diff --git a/GEODE/ENE/Detector.py b/GEODE/ENE/Detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e31d671360f276aaf94afd8169a4d26446a52a3
--- /dev/null
+++ b/GEODE/ENE/Detector.py
@@ -0,0 +1,25 @@
+from GEODE.Metadata import articleKey, fromKey, toKey
+import pandas
+import sys
+
+columns = articleKey + ['event', 'position', 'size', 'totalSize']
+
+def eventsOfAnnotation(start, common, annotated):
+    rows = pandas.DataFrame([{**common,
+                              'event': span.label_,
+                              'position': start + span.start_char,
+                              'size': span.end_char - span.start_char}
+                             for span in annotated.spans['sc']])
+    if len(rows):
+        rows[columns].to_csv(sys.stdout, sep='\t', index=False, header=False)
+
+def detector(model, corpus):
+    print(*columns, sep='\t')
+    for text in corpus.get_all():
+        paragraphs = text['content'].split('\n\n')
+        common = {**fromKey(toKey(text)),
+                  'totalSize': sum([len(p) for p in paragraphs])}
+        currentPosition = 0
+        for paragraph in paragraphs:
+            eventsOfAnnotation(currentPosition, common, model(paragraph))
+            currentPosition += len(paragraph)
diff --git a/GEODE/ENE/__init__.py b/GEODE/ENE/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e05893a8dbe3572a9f5d90175b119b59352a5252
--- /dev/null
+++ b/GEODE/ENE/__init__.py
@@ -0,0 +1,14 @@
+ene_labels = [
+        "Domain-mark",
+        "Head",
+        "Latlong",
+        "NC-Spatial",
+        "NC-Person",
+        "NP-Misc",
+        "NP-Person",
+        "NP-Spatial",
+        "Relation",
+        "ENE-Spatial",
+        "ENE-Person",
+        "ENE-Misc"
+        ]
diff --git a/GEODE/ENE/__main__.py b/GEODE/ENE/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b39e34aed1fe425123868503aa45cf833c29aae
--- /dev/null
+++ b/GEODE/ENE/__main__.py
@@ -0,0 +1,6 @@
+from GEODE.ENE.Detector import detector
+from GEODE.Store import corpus
+import spacy
+import sys
+
+detector(spacy.load(sys.argv[1]), corpus(sys.argv[2]))
diff --git a/guix.scm b/guix.scm
index fb6e3dbfd36094391595b049fd34f9879fb54f81..ec463d136fc27e2b01877ccda10af0d351a87109 100644
--- a/guix.scm
+++ b/guix.scm
@@ -1,4 +1,4 @@
-(use-modules ((gnu packages machine-learning) #:select (python-pytorch python-scikit-learn))
+(use-modules ((gnu packages machine-learning) #:select (python-pytorch python-scikit-learn python-spacy))
              ((gnu packages python-science) #:select (python-pandas))
              ((gnu packages python-xyz) #:select (python-matplotlib
                                                   python-nltk
@@ -23,7 +23,8 @@
       (list python-matplotlib
             python-pandas
             python-scikit-learn
-            python-seaborn))
+            python-seaborn
+            python-spacy))
     (arguments
       (list #:tests? #f))
     (home-page "https://gitlab.liris.cnrs.fr/geode/geopyck")
diff --git a/requirements.txt b/requirements.txt
index 715fabc1825850671e2e69d32e51cfe3d0a88de4..b8187358a1c9b3056e922b282bdf496801090773 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
+matplotlib
 pandas
 scikit-learn
-matplotlib
 seaborn
+spacy