Compare revisions
Target project: abrenon/outillage
Commits on Source (4)
Showing with 157 additions and 14 deletions
File moved
from BERT.Base import BERT
import datetime
from loaders import set_random
import numpy
import random
import time
import torch
from torch.optim import AdamW
@@ -31,9 +32,13 @@ class Trainer(BERT):
            num_warmup_steps = 0, # Default value in run_glue.py
            num_training_steps = self.epochs * len(data_loader))
    def __call__(self):
        set_random()
        losses = [self.epoch(e) for e in range(self.epochs)]
    def __call__(self, seed_value=42):
        random.seed(seed_value)
        numpy.random.seed(seed_value)
        torch.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        for e in range(self.epochs):
            self.epoch(e)
        self.save()
        print("\nTraining complete!")
......
from BERT.Base import BERT
from BERT.Classifier import Classifier
from BERT.LabeledData import LabeledData
from BERT.Trainer import Trainer
from GEODE.Classification.discursive import functions as discursiveFunctions
knowledgeDomains = [ 'Agriculture',
                     'Beaux-arts',
                     'Belles-lettres',
                     'Chasse',
                     'Commerce',
                     'Droit Jurisprudence',
                     'Géographie',
                     'Histoire',
                     'Histoire naturelle',
                     'Médecine',
                     'Métiers',
                     'Militaire',
                     'Musique',
                     'Philosophie',
                     'Physique',
                     'Politique',
                     'Religion' ]
functions = {'Historical narrative',
functions = ['Historical narrative',
             'People narrative',
             'Critical',
             'Description',
@@ -6,4 +6,4 @@ functions = {'Historical narrative',
             'Example',
             'Reasoning',
             'Quotation',
             'Prescriptive'}
             'Prescriptive']
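Switching from a set to a list presumably gives the discursive functions a stable order, which matters as soon as the labels are mapped to numeric indices; a minimal illustration (the dictionary names are ours):

from GEODE.Classification.discursive import functions

# With a list, each label keeps the same index from one run to the next;
# iterating over a set would not guarantee that.
label2id = {label: i for i, label in enumerate(functions)}
id2label = dict(enumerate(functions))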
from GEODE.store import prepare
import matplotlib.pyplot as plot
import seaborn
def heatmap(matrix, filePath, labels, **kwargs):
    plot.figure(figsize=(16,13))
    if 'cmap' not in kwargs:
        kwargs['cmap'] = 'Blues'
    ax = seaborn.heatmap(
        matrix, xticklabels=labels, yticklabels=labels, **kwargs
    )
    plot.savefig(prepare(filePath), dpi=300, bbox_inches='tight')
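A possible call site, to show how the extra keyword arguments flow through to seaborn (the matrix, labels and output path are made up for the example):

confusion = [[12, 3],
             [1, 20]]
heatmap(confusion, 'figures/confusion.png', ['accepted', 'rejected'],
        annot=True, fmt='d')
# 'cmap' is not passed, so the function falls back to 'Blues';
# prepare() creates the figures/ directory if it does not exist yet.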
from GEODE.Classification import discursiveFunctions
from GEODE.Metadata import article, paragraph, fromKey, relativePath, toKey, uid
from GEODE.store import corpus, Directory, SelfContained, toTSV
from GEODE.Visualisation import heatmap
import math
def curry(f):
    return lambda x: (lambda *args: f(x, *args))
def gate(n, size, offset=0):
    return [1 if i == n else 0 for i in range(offset, offset+size)]
@curry
def orientedIntersection(l, sNew, sOld):
    left = max(sNew*l[0], sOld*l[1])
    right = min((sNew+1)*l[0], (sOld+1)*l[1])
    return max(right-left, 0)
@curry
def resample(newSize, distribution):
    oldSize = len(distribution)
    lcm = math.lcm(newSize, oldSize)
    intersection = orientedIntersection((lcm/newSize, lcm/oldSize))
    ratio = oldSize / newSize
    for i in range(newSize):
        yield oldSize/lcm*sum([distribution[j]*intersection(i, j)
                               for j in range(math.floor(i*ratio),
                                              math.ceil((i+1)*ratio))])
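To make the resampling concrete, here is what the generator yields on two toy distributions, worked out from the definitions above; note that the total mass of the distribution is preserved in both directions:

# Downsampling: 4 bins -> 2 bins, each new bin absorbing two old ones.
list(resample(2)([1, 2, 3, 4]))   # [3.0, 7.0]
# Upsampling: 2 bins -> 3 bins, the middle bin straddling both old ones.
list(resample(3)([1, 2]))         # [0.666..., 1.0, 1.333...]
# gate() simply builds a one-hot vector of the requested size.
gate(1, 4)                        # [0, 1, 0, 0]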
from GEODE import fromKey, relativePath
from GEODE.Metadata import fromKey, relativePath
from GEODE.store.TSV import toTSV
import pandas
from os import makedirs
from os.path import dirname, isdir
@@ -54,7 +55,7 @@ class TSVIndexed(Corpus):
    def full(self, key, row):
        d = self.key(key, row)
        d[self.column_name] = self.content(key, row).strip() + '\n'
        d[self.column_name] = self.content(key, row).strip()
        return d
    def get_all(self, projector=None, where=None):
@@ -98,7 +99,7 @@ class SelfContained(TSVIndexed):
    def save(self, iterator):
        self.data = pandas.DataFrame(iterator)
        self.detect_keys()
        self.data.to_csv(self.tsv_path, sep='\t', index=False)
        toTSV(self.tsv_path, self.data)
class Directory(TSVIndexed):
    """
@@ -144,7 +145,7 @@ class Directory(TSVIndexed):
        self.detect_keys()
        for _, row in self.data.iterrows():
            self.write_text(row, row[self.column_name])
        self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)
        toTSV(self.tsv_path, self.data[self.keys])
def corpus(path, **kwargs):
    if path[-1:] == '/' or isdir(path):
......
from GEODE.signal import curry
from numpy import vectorize
import pandas
@curry
def toStrKey(areParagraphs, row):
    key = "{work}_{volume:02d}_{article:04d}"
    if areParagraphs:
        key += "_{paragraph:04d}"
    return key.format(**row)
def forPanda(data, f):
    return vectorize(lambda i: f(data.iloc[i]))
def toTSV(filePath, data, sortBy='toStrKey'):
    if type(data) != pandas.DataFrame:
        data = pandas.DataFrame(data)
    if sortBy == 'toStrKey':
        sortBy = toStrKey('paragraph' in data)
    if sortBy is None:
        sortedData = data
    else:
        sortedData = data.sort_index(key=forPanda(data, sortBy))
    sortedData.to_csv(filePath, sep='\t', index=False)
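For illustration, toTSV can be fed plain dictionaries as long as they carry the key columns expected by toStrKey (the file name and row contents below are invented):

rows = [{'work': 'EDdA', 'volume': 1, 'article': 12, 'content': 'text of one article'},
        {'work': 'EDdA', 'volume': 1, 'article': 3, 'content': 'text of another article'}]
# No 'paragraph' column, so rows are sorted on "{work}_{volume:02d}_{article:04d}":
# "EDdA_01_0003" ends up before "EDdA_01_0012" in the output file.
toTSV('articles.tsv', rows)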
from GEODE.store.Corpus import corpus, Directory, SelfContained
from GEODE.store.TSV import toTSV
import os
import os.path
def prepare(path):
    if '/' in path:
        os.makedirs(os.path.dirname(path), exist_ok=True)
    return path
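In other words, callers can write to a nested path without checking for the directory first; for instance (the path is illustrative):

with open(prepare('output/figures/scores.tsv'), 'w') as file:
    # 'output/figures/' is created on the fly if it does not exist yet.
    file.write("label\tscore\n")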
import json
import sys
def load(file_path):
    if type(file_path) == str:
@@ -9,7 +10,22 @@ def load(file_path):
    for line in file_path.readlines():
        yield json.loads(line)
"""
def load(file_path):
    if type(file_path) == str:
        with open(file_path, 'r') as input_file:
            return list(loadObjects(input_file))
    else:
        return loadObjects(file_path)
def loadObjects(input_file):
    for line in input_file.readlines():
        yield json.loads(line)
"""
def save(file_path, objects):
    if file_path == '-':
        file_path = sys.stdout
    if type(file_path) == str:
        with open(file_path, 'w') as output_file:
            saveObjects(output_file, objects)
@@ -18,5 +34,5 @@ def save(file_path, objects):
def saveObjects(output_file, objects):
    for obj in objects:
        json.dump(obj, output_file)
        json.dump(obj, output_file, separators=(',', ':'))
        print(file=output_file)
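Taken together, save and saveObjects produce one compact JSON object per line (the JSON Lines convention); a quick illustration with made-up records:

records = [{'uid': 'EDdA_01_0001', 'answer': 'accept'},
           {'uid': 'EDdA_01_0002', 'answer': 'reject'}]
save('annotations.jsonl', records)
# annotations.jsonl now contains, thanks to separators=(',', ':'):
# {"uid":"EDdA_01_0001","answer":"accept"}
# {"uid":"EDdA_01_0002","answer":"reject"}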
from GEODE import uid
def getUID(annotation):
    return uid(annotation['meta'])
def UnknownAnswer(annotation, answer):
    print(f"Unsupported answer '{answer}' for annotation {getUID(annotation)}")
def TwoAnnotations(annotation, first, second):
    print(f"Found two annotations for {getUID(annotation)}: " +
          f"'{first}' and '{second}'")
def Contradiction(annotation, label):
    print(f"Contradiction found for {getUID(annotation)}: " +
          f"function {label} should be both accepted and rejected")
def NoLabelLeft(text):
    print(f"No possible function left for {uid(text)}")
@@ -12,3 +12,7 @@ def checkBound(f):
def parseRatio(s):
    return checkBound(int(s[:-1]) / 100 if s[-1] == '%' else float(s))
def toIterator(*args):
    for arg in args:
        for elem in arg:
            yield elem
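Concretely, parseRatio accepts either a percentage or a bare float (the actual validation happens in checkBound, collapsed above, which is assumed here to return the value it checks), and toIterator chains its arguments much like itertools.chain:

parseRatio('80%')                             # 0.8, via int('80') / 100
parseRatio('0.15')                            # 0.15, via float('0.15')
list(toIterator([1, 2], (3,), range(4, 6)))   # [1, 2, 3, 4, 5]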
@@ -7,10 +7,12 @@
((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt))
((gnu packages haskell-xyz) #:select (ghc-cassava
                                      ghc-hs-conllu
                                      ghc-random))
                                      ghc-random
                                      ghc-regex-tdfa))
((gnu packages machine-learning) #:select (python-scikit-learn python-spacy))
((gnu packages python) #:select (python))
((gnu packages python-science) #:select (python-pandas))
((gnu packages python-xyz) #:select (python-beautifulsoup4))
((gnu packages python-xyz) #:select (python-beautifulsoup4 python-seaborn))
((gnu packages xml) #:select (python-lxml)))
;(define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm"))
@@ -32,12 +34,16 @@
ghc-hs-conllu ; working on syntax-annotated documents
ghc-hxt ; working on xml documents
ghc-random ; sampling data at random
ghc-regex-tdfa ; working with regexps in haskell
processing-lge ; extracting articles from the BnF files
python ; scripts
python-beautifulsoup4 ; extract EDdA metadata from TEI files
;python-edda ; TODO
python-lxml ; fusion articles into tomes for TXM
python-pandas ; working with CSV in python
python-scikit-learn ; evaluating models
python-seaborn ; draw figures
python-spacy ; working with prodigy's custom formats
python-stanza ; annotation
sed ; select files from listing
stanza-fr ; annotation
......
#!/bin/sh
source ${0%/*}/../lib.sh
source ${0%%/*}/lib/bash.sh
if [ "$#" != 2 ]
then
......