From 2fbe96cd6743c805fce55467d18f8152ff86886c Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Wed, 13 Dec 2023 10:13:15 +0100
Subject: [PATCH] Moving python lib out of scripts directory

---
 {scripts/ML => lib/python}/BERT/Base.py       |  0
 {scripts/ML => lib/python}/BERT/Classifier.py |  0
 .../ML => lib/python/BERT}/LabeledData.py     |  0
 {scripts/ML => lib/python}/BERT/Trainer.py    | 13 ++--
 {scripts/ML => lib/python}/BERT/__init__.py   |  1 +
 lib/python/GEODE/Classification/__init__.py   | 19 ++++++
 .../GEODE/Classification}/discursive.py       |  4 +-
 lib/python/GEODE/Classification/domains.py    |  0
 .../python/GEODE/Metadata}/__init__.py        |  0
 lib/python/GEODE/Visualisation.py             | 12 ++++
 lib/python/GEODE/__init__.py                  |  4 ++
 lib/python/GEODE/signal.py                    | 24 +++++++
 .../ML => lib/python/GEODE/store}/Corpus.py   |  9 +--
 lib/python/GEODE/store/TSV.py                 | 24 +++++++
 lib/python/GEODE/store/__init__.py            |  9 +++
 {scripts/ML => lib/python}/JSONL.py           | 18 ++++-
 lib/python/split/Error.py                     | 18 +++++
 .../util.py => lib/python/split/__init__.py   |  4 ++
 manifest.scm                                  |  6 +-
 scripts/LGE/extract-from-source.sh            |  2 +-
 scripts/ML/GEODE/Error.py                     |  5 --
 scripts/ML/convert-corpus.py                  |  4 +-
 scripts/ML/evaluate.py                        | 18 ++---
 scripts/ML/loaders.py                         | 10 ---
 scripts/ML/predictMulti.py                    | 12 ++--
 scripts/ML/predictSimple.py                   |  9 ++-
 scripts/ML/prodigy-jsonl-to-tsv.hs            | 66 +++++++++++++++++--
 scripts/ML/prodigy-tsv-to-jsonl.hs            | 59 +++++++++++++----
 scripts/ML/prodigyAcceptedJSONLToTSV.py       | 12 ++--
 scripts/ML/prodigyMultiJSONLToDirectory.py    | 56 +++++++---------
 scripts/ML/simpleTrainOfMulti.py              | 26 --------
 scripts/ML/splitMulti.py                      | 51 +++++++-------
 scripts/ML/train.py                           |  9 ---
 scripts/ML/trainMultiBERT.py                  | 11 ++--
 scripts/ML/trainSimpleBERT.py                 | 13 ++--
 scripts/extract-from-source.sh                |  8 +--
 36 files changed, 358 insertions(+), 178 deletions(-)
 rename {scripts/ML => lib/python}/BERT/Base.py (100%)
 rename {scripts/ML => lib/python}/BERT/Classifier.py (100%)
 rename {scripts/ML => lib/python/BERT}/LabeledData.py (100%)
 rename {scripts/ML => lib/python}/BERT/Trainer.py (88%)
 rename {scripts/ML => lib/python}/BERT/__init__.py (70%)
 create mode 100644 lib/python/GEODE/Classification/__init__.py
 rename {scripts/ML/GEODE => lib/python/GEODE/Classification}/discursive.py (73%)
 create mode 100644 lib/python/GEODE/Classification/domains.py
 rename {scripts/ML/GEODE => lib/python/GEODE/Metadata}/__init__.py (100%)
 create mode 100644 lib/python/GEODE/Visualisation.py
 create mode 100644 lib/python/GEODE/__init__.py
 create mode 100644 lib/python/GEODE/signal.py
 rename {scripts/ML => lib/python/GEODE/store}/Corpus.py (94%)
 create mode 100644 lib/python/GEODE/store/TSV.py
 create mode 100644 lib/python/GEODE/store/__init__.py
 rename {scripts/ML => lib/python}/JSONL.py (57%)
 create mode 100644 lib/python/split/Error.py
 rename scripts/ML/GEODE/util.py => lib/python/split/__init__.py (79%)
 delete mode 100644 scripts/ML/GEODE/Error.py
 delete mode 100644 scripts/ML/loaders.py
 delete mode 100755 scripts/ML/simpleTrainOfMulti.py
 delete mode 100755 scripts/ML/train.py

diff --git a/scripts/ML/BERT/Base.py b/lib/python/BERT/Base.py
similarity index 100%
rename from scripts/ML/BERT/Base.py
rename to lib/python/BERT/Base.py
diff --git a/scripts/ML/BERT/Classifier.py b/lib/python/BERT/Classifier.py
similarity index 100%
rename from scripts/ML/BERT/Classifier.py
rename to lib/python/BERT/Classifier.py
diff --git a/scripts/ML/LabeledData.py b/lib/python/BERT/LabeledData.py
similarity index 100%
rename from scripts/ML/LabeledData.py
rename to lib/python/BERT/LabeledData.py
diff --git a/scripts/ML/BERT/Trainer.py b/lib/python/BERT/Trainer.py
similarity index 88%
rename from scripts/ML/BERT/Trainer.py
rename to lib/python/BERT/Trainer.py
index 70c2a8f..b094d14 100644
--- a/scripts/ML/BERT/Trainer.py
+++ b/lib/python/BERT/Trainer.py
@@ -1,6 +1,7 @@
 from BERT.Base import BERT
 import datetime
-from loaders import set_random
+import numpy
+import random
 import time
 import torch
 from torch.optim import AdamW
@@ -31,9 +32,13 @@ class Trainer(BERT):
             num_warmup_steps = 0, # Default value in run_glue.py
             num_training_steps = self.epochs * len(data_loader))
 
-    def __call__(self):
-        set_random()
-        losses = [self.epoch(e) for e in range(self.epochs)]
+    def __call__(self, seed_value=42):
+        random.seed(seed_value)
+        numpy.random.seed(seed_value)
+        torch.manual_seed(seed_value)
+        torch.cuda.manual_seed_all(seed_value)
+        for e in range(self.epochs):
+            self.epoch(e)
         self.save()
         print("\nTraining complete!")
 
diff --git a/scripts/ML/BERT/__init__.py b/lib/python/BERT/__init__.py
similarity index 70%
rename from scripts/ML/BERT/__init__.py
rename to lib/python/BERT/__init__.py
index 50cbcc1..ce7c99a 100644
--- a/scripts/ML/BERT/__init__.py
+++ b/lib/python/BERT/__init__.py
@@ -1,3 +1,4 @@
 from BERT.Base import BERT
 from BERT.Classifier import Classifier
+from BERT.LabeledData import LabeledData
 from BERT.Trainer import Trainer
diff --git a/lib/python/GEODE/Classification/__init__.py b/lib/python/GEODE/Classification/__init__.py
new file mode 100644
index 0000000..695af95
--- /dev/null
+++ b/lib/python/GEODE/Classification/__init__.py
@@ -0,0 +1,19 @@
+from GEODE.Classification.discursive import functions as discursiveFunctions
+
+knowledgeDomains = [ 'Agriculture',
+                     'Beaux-arts',
+                     'Belles-lettres',
+                     'Chasse',
+                     'Commerce',
+                     'Droit Jurisprudence',
+                     'Géographie',
+                     'Histoire',
+                     'Histoire naturelle',
+                     'Médecine',
+                     'Métiers',
+                     'Militaire',
+                     'Musique',
+                     'Philosophie',
+                     'Physique',
+                     'Politique',
+                     'Religion' ]
diff --git a/scripts/ML/GEODE/discursive.py b/lib/python/GEODE/Classification/discursive.py
similarity index 73%
rename from scripts/ML/GEODE/discursive.py
rename to lib/python/GEODE/Classification/discursive.py
index 60a958e..3161984 100644
--- a/scripts/ML/GEODE/discursive.py
+++ b/lib/python/GEODE/Classification/discursive.py
@@ -1,4 +1,4 @@
-functions = {'Historical narrative',
+functions = ['Historical narrative',
              'People narrative',
              'Critical',
              'Description',
@@ -6,4 +6,4 @@ functions = {'Historical narrative',
              'Example',
              'Reasoning',
              'Quotation',
-             'Prescriptive'}
+             'Prescriptive']
diff --git a/lib/python/GEODE/Classification/domains.py b/lib/python/GEODE/Classification/domains.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/ML/GEODE/__init__.py b/lib/python/GEODE/Metadata/__init__.py
similarity index 100%
rename from scripts/ML/GEODE/__init__.py
rename to lib/python/GEODE/Metadata/__init__.py
diff --git a/lib/python/GEODE/Visualisation.py b/lib/python/GEODE/Visualisation.py
new file mode 100644
index 0000000..c60b3ab
--- /dev/null
+++ b/lib/python/GEODE/Visualisation.py
@@ -0,0 +1,12 @@
+from GEODE.store import prepare
+import matplotlib.pyplot as plot
+import seaborn
+
+def heatmap(matrix, filePath, labels, **kwargs):
+    plot.figure(figsize=(16,13))
+    if 'cmap' not in kwargs:
+        kwargs['cmap'] = 'Blues'
+    ax = seaborn.heatmap(
+        matrix, xticklabels=labels, yticklabels=labels, **kwargs
+    )
+    plot.savefig(prepare(filePath), dpi=300, bbox_inches='tight')
diff --git a/lib/python/GEODE/__init__.py b/lib/python/GEODE/__init__.py
new file mode 100644
index 0000000..fd7e6c6
--- /dev/null
+++ b/lib/python/GEODE/__init__.py
@@ -0,0 +1,4 @@
+from GEODE.Classification import discursiveFunctions
+from GEODE.Metadata import article, paragraph, relativePath, toKey, uid
+from GEODE.store import corpus, Directory, SelfContained, toTSV
+from GEODE.Visualisation import heatmap
diff --git a/lib/python/GEODE/signal.py b/lib/python/GEODE/signal.py
new file mode 100644
index 0000000..1e2fa18
--- /dev/null
+++ b/lib/python/GEODE/signal.py
@@ -0,0 +1,24 @@
+import math
+
+def curry(f):
+    return lambda x: (lambda *args: f(x, *args))
+
+def gate(n, size, offset=0):
+    return [1 if i == n else 0 for i in range(offset, offset+size)]
+
+@curry
+def orientedIntersection(l, sNew, sOld):
+    left = max(sNew*l[0], sOld*l[1])
+    right = min((sNew+1)*l[0], (sOld+1)*l[1])
+    return max(right-left, 0)
+
+@curry
+def resample(newSize, distribution):
+    oldSize = len(distribution)
+    lcm = math.lcm(newSize, oldSize)
+    intersection = orientedIntersection((lcm/newSize, lcm/oldSize))
+    ratio = oldSize / newSize
+    for i in range(newSize):
+        yield oldSize/lcm*sum([distribution[j]*intersection(i, j)
+                               for j in range(math.floor(i*ratio),
+                                              math.ceil((i+1)*ratio))])
diff --git a/scripts/ML/Corpus.py b/lib/python/GEODE/store/Corpus.py
similarity index 94%
rename from scripts/ML/Corpus.py
rename to lib/python/GEODE/store/Corpus.py
index 5abf5b3..e72b8b6 100644
--- a/scripts/ML/Corpus.py
+++ b/lib/python/GEODE/store/Corpus.py
@@ -1,4 +1,5 @@
-from GEODE import fromKey, relativePath
+from GEODE.Metadata import fromKey, relativePath
+from GEODE.store.TSV import toTSV
 import pandas
 from os import makedirs
 from os.path import dirname, isdir
@@ -54,7 +55,7 @@ class TSVIndexed(Corpus):
 
     def full(self, key, row):
         d = self.key(key, row)
-        d[self.column_name] = self.content(key, row).strip() + '\n'
+        d[self.column_name] = self.content(key, row).strip()
         return d
 
     def get_all(self, projector=None, where=None):
@@ -98,7 +99,7 @@ class SelfContained(TSVIndexed):
     def save(self, iterator):
        self.data = pandas.DataFrame(iterator)
        self.detect_keys()
-       self.data.to_csv(self.tsv_path, sep='\t', index=False)
+       toTSV(self.tsv_path, self.data)
 
 class Directory(TSVIndexed):
     """
@@ -144,7 +145,7 @@ class Directory(TSVIndexed):
         self.detect_keys()
         for _, row in self.data.iterrows():
             self.write_text(row, row[self.column_name])
-        self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)
+        toTSV(self.tsv_path, self.data[self.keys])
 
 def corpus(path, **kwargs):
     if path[-1:] == '/' or isdir(path):
diff --git a/lib/python/GEODE/store/TSV.py b/lib/python/GEODE/store/TSV.py
new file mode 100644
index 0000000..3967be3
--- /dev/null
+++ b/lib/python/GEODE/store/TSV.py
@@ -0,0 +1,24 @@
+from GEODE.signal import curry
+from numpy import vectorize
+import pandas
+
+@curry
+def toStrKey(areParagraphs, row):
+    key = "{work}_{volume:02d}_{article:04d}"
+    if areParagraphs:
+        key += "_{paragraph:04d}"
+    return key.format(**row)
+
+def forPanda(data, f):
+    return vectorize(lambda i: f(data.iloc[i]))
+
+def toTSV(filePath, data, sortBy='toStrKey'):
+    if type(data) != pandas.DataFrame:
+        data = pandas.DataFrame(data)
+    if sortBy == 'toStrKey':
+        sortBy = toStrKey('paragraph' in data)
+    if sortBy is None:
+        sortedData = data
+    else:
+        sortedData = data.sort_index(key=forPanda(data, sortBy))
+    sortedData.to_csv(filePath, sep='\t', index=False)
diff --git a/lib/python/GEODE/store/__init__.py b/lib/python/GEODE/store/__init__.py
new file mode 100644
index 0000000..ebbd38d
--- /dev/null
+++ b/lib/python/GEODE/store/__init__.py
@@ -0,0 +1,9 @@
+from GEODE.store.Corpus import corpus, Directory, SelfContained
+from GEODE.store.TSV import toTSV
+import os
+import os.path
+
+def prepare(path):
+    if '/' in path:
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+    return path
diff --git a/scripts/ML/JSONL.py b/lib/python/JSONL.py
similarity index 57%
rename from scripts/ML/JSONL.py
rename to lib/python/JSONL.py
index 07b2aaf..4e05d05 100644
--- a/scripts/ML/JSONL.py
+++ b/lib/python/JSONL.py
@@ -1,4 +1,5 @@
 import json
+import sys
 
 def load(file_path):
     if type(file_path) == str:
@@ -9,7 +10,22 @@ def load(file_path):
     for line in file_path.readlines():
         yield json.loads(line)
 
+"""
+def load(file_path):
+    if type(file_path) == str:
+        with open(file_path, 'r') as input_file:
+            return list(loadObjects(input_file))
+    else:
+        return loadObjects(file_path)
+
+def loadObjects(input_file):
+    for line in input_file.readlines():
+        yield json.loads(line)
+"""
+
 def save(file_path, objects):
+    if file_path == '-':
+        file_path = sys.stdout
     if type(file_path) == str:
         with open(file_path, 'w') as output_file:
             saveObjects(output_file, objects)
@@ -18,5 +34,5 @@
 
 def saveObjects(output_file, objects):
     for obj in objects:
-        json.dump(obj, output_file)
+        json.dump(obj, output_file, separators=(',', ':'))
         print(file=output_file)
diff --git a/lib/python/split/Error.py b/lib/python/split/Error.py
new file mode 100644
index 0000000..867e5b8
--- /dev/null
+++ b/lib/python/split/Error.py
@@ -0,0 +1,18 @@
+from GEODE import uid
+
+def getUID(annotation):
+    return uid(annotation['meta'])
+
+def UnknownAnswer(annotation, answer):
+    print(f"Unsupported answer '{answer}' for annotation {getUID(annotation)}")
+
+def TwoAnnotations(annotation, first, second):
+    print(f"Found two annotations for {getUID(annotation)}: " +
+          f"'{first}' and '{second}'")
+
+def Contradiction(annotation, label):
+    print(f"Contradiction found for {getUID(annotation)}: " +
+          f"function {label} should be both accepted and rejected")
+
+def NoLabelLeft(text):
+    print(f"No possible function left for {uid(text)}")
diff --git a/scripts/ML/GEODE/util.py b/lib/python/split/__init__.py
similarity index 79%
rename from scripts/ML/GEODE/util.py
rename to lib/python/split/__init__.py
index e9945b1..9643512 100644
--- a/scripts/ML/GEODE/util.py
+++ b/lib/python/split/__init__.py
@@ -12,3 +12,7 @@ def checkBound(f):
 
 def parseRatio(s):
     return checkBound(int(s[:-1]) / 100 if s[-1] == '%' else float(s))
+def toIterator(*args):
+    for arg in args:
+        for elem in arg:
+            yield elem
diff --git a/manifest.scm b/manifest.scm
index 1234356..4627992 100644
--- a/manifest.scm
+++ b/manifest.scm
@@ -9,9 +9,10 @@
                 ghc-hs-conllu
                 ghc-random
                 ghc-regex-tdfa))
+  ((gnu packages machine-learning) #:select (python-scikit-learn python-spacy))
   ((gnu packages python) #:select (python))
   ((gnu packages python-science) #:select (python-pandas))
-  ((gnu packages python-xyz) #:select (python-beautifulsoup4))
+  ((gnu packages python-xyz) #:select (python-beautifulsoup4 python-seaborn))
   ((gnu packages xml) #:select (python-lxml)))
 
 ;(define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm"))
@@ -40,6 +41,9 @@
         ;python-edda ; TODO
         python-lxml ; fusion articles into tomes for TXM
         python-pandas ; working with CSV in python
+        python-scikit-learn ; evaluating models
+        python-seaborn ; draw figures
+        python-spacy ; working with prodigy's custom formats
         python-stanza ; annotation
         sed ; select files from listing
         stanza-fr ; annotation
diff --git a/scripts/LGE/extract-from-source.sh b/scripts/LGE/extract-from-source.sh
index 75a8883..931e250 100755
--- a/scripts/LGE/extract-from-source.sh
+++ b/scripts/LGE/extract-from-source.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-source ${0%/*}/../lib.sh
+source ${0%%/*}/lib/bash.sh
 
 if [ "$#" != 2 ]
 then
diff --git a/scripts/ML/GEODE/Error.py b/scripts/ML/GEODE/Error.py
deleted file mode 100644
index 09cd467..0000000
--- a/scripts/ML/GEODE/Error.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from GEODE import uid
-
-def TwoAnnotations(text, first, second):
-    textUID = text if type(text) == str else uid(text)
-    return f"Found two annotations for {textUID}: '{first}' and '{second}'"
diff --git a/scripts/ML/convert-corpus.py b/scripts/ML/convert-corpus.py
index a37fb2c..6b2b703 100755
--- a/scripts/ML/convert-corpus.py
+++ b/scripts/ML/convert-corpus.py
@@ -1,5 +1,5 @@
-#!/usr/bin/env python3
-from Corpus import corpus
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+from GEODE import corpus
 import sys
 
 if __name__ == '__main__':
diff --git a/scripts/ML/evaluate.py b/scripts/ML/evaluate.py
index 104cdad..27d05f0 100755
--- a/scripts/ML/evaluate.py
+++ b/scripts/ML/evaluate.py
@@ -1,7 +1,7 @@
-#!/usr/bin/env python3
-from EDdA.classification import heatmap
-from EDdA.store import preparePath
-import GEODE.discursive as discursive
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+
+from GEODE.Visualisation import heatmap
+from GEODE.Classification import discursiveFunctions, knowledgeDomains
 import pandas
 from sklearn.metrics import classification_report, confusion_matrix
 from sys import argv
@@ -9,11 +9,13 @@ from sys import argv
 def evaluate(truth, predictions, outputDirectory):
     matrix = confusion_matrix(truth,
                               predictions,
-                              labels=list(discursive.functions),
+                              labels=knowledgeDomains,
+                              #labels=discursiveFunctions
                               normalize='true')
     heatmap(matrix,
-            preparePath(f"{outputDirectory}/confusion.png"),
-            labels=discursive.functions)
+            f"{outputDirectory}/confusion.png",
+            labels=knowledgeDomains)
+            #labels=discursiveFunctions)
     with open(f"{outputDirectory}/report.json", 'w') as json:
         print(classification_report(truth, predictions, output_dict=True),
               file=json)
@@ -24,4 +26,4 @@
 if __name__ == '__main__':
     truth = pandas.read_csv(argv[1], sep='\t')
     predictions = pandas.read_csv(argv[2], sep='\t')
-    evaluate(truth['paragraphFunction'], predictions['label'], argv[3])
+    evaluate(truth['super_domain'], predictions['label'], argv[3])
diff --git a/scripts/ML/loaders.py b/scripts/ML/loaders.py
deleted file mode 100644
index 93986f4..0000000
--- a/scripts/ML/loaders.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import numpy
-import random
-import torch
-
-def set_random():
-    seed_value = 42
-    random.seed(seed_value)
-    numpy.random.seed(seed_value)
-    torch.manual_seed(seed_value)
-    torch.cuda.manual_seed_all(seed_value)
diff --git a/scripts/ML/predictMulti.py b/scripts/ML/predictMulti.py
index 80d7eaa..93ab6ab 100755
--- a/scripts/ML/predictMulti.py
+++ b/scripts/ML/predictMulti.py
@@ -1,7 +1,7 @@
-#!/usr/bin/env python3
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+
 from BERT import Classifier
-from Corpus import corpus
-import GEODE.discursive as discursive
+from GEODE import corpus, discursiveFunctions, toTSV
 import pandas
 from sys import argv
 
@@ -10,12 +10,12 @@ def rateClass(name, answer, score):
 
 def combine(row):
     classes = [(name, row[name], row[name + 'Score'])
-               for name in discursive.functions]
+               for name in discursiveFunctions]
     return max(classes, key=lambda c: rateClass(*c))[0]
 
 def label(modelsRoot, source):
     records = pandas.DataFrame(source.get_all('key'))
-    for name in discursive.functions:
+    for name in discursiveFunctions:
         classify = Classifier(f"{modelsRoot}/{name}")
         content = source.get_all('content')
         records[name], records[name + 'Score'] = classify(content)
@@ -23,4 +23,4 @@
     return records
 
 if __name__ == '__main__':
-    label(argv[1], corpus(argv[2])).to_csv(argv[3], sep='\t', index=False)
+    toTSV(argv[3], label(argv[1], corpus(argv[2])))
diff --git a/scripts/ML/predictSimple.py b/scripts/ML/predictSimple.py
index 5ba28ad..3f511c8 100755
--- a/scripts/ML/predictSimple.py
+++ b/scripts/ML/predictSimple.py
@@ -1,7 +1,8 @@
-#!/usr/bin/env python3
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+
 from BERT import Classifier
+from GEODE import corpus, toTSV
 import pandas
-from Corpus import corpus
 from sys import argv
 
 def label(classify, source, name='label'):
@@ -26,6 +27,4 @@
     return records
 
 if __name__ == '__main__':
-    classify = Classifier(argv[1])
-    source = corpus(argv[2])
-    label(classify, source).to_csv(argv[3], sep='\t', index=False)
+    toTSV(argv[3], label(Classifier(argv[1]), corpus(argv[2])))
diff --git a/scripts/ML/prodigy-jsonl-to-tsv.hs b/scripts/ML/prodigy-jsonl-to-tsv.hs
index d26e850..c37c505 100755
--- a/scripts/ML/prodigy-jsonl-to-tsv.hs
+++ b/scripts/ML/prodigy-jsonl-to-tsv.hs
@@ -1,17 +1,38 @@
 #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances"
 {-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-}
-import Data.Aeson ((.:), FromJSON(..), Value(..), encode, withArray, withText, eitherDecode)
-import Data.Aeson.Types (prependFailure, typeMismatch)
+import Control.Applicative ((<|>))
+import Control.Monad.Except (MonadError(..), ExceptT(..), runExceptT)
+import Data.Aeson ((.:), FromJSON(..), Object, Value(..), encode, withArray, withObject, withText, eitherDecode)
+import Data.Aeson.Types (Parser, prependFailure, typeMismatch)
 import Data.ByteString.Lazy as BS (null, readFile, split)
 import Data.ByteString.Lazy.Char8 as BS (unpack)
+import Data.String (IsString(..))
+import Data.Text as Text (Text)
 import Data.Vector as Vector (head)
-import GEODE.Metadata (type (@)(..), tsvFile)
+import GEODE.Metadata (type (@)(..), Record(..), tsvFile)
 import GEODE.Metadata.ProdigyMeta (Classification(..), ClassifiedParagraph)
 import System.Environment (getArgs)
 import System.Script (try, syntax, warn)
+import Text.Printf (printf)
 
 data Row = Unclassified String | Full ClassifiedParagraph
 instance {-# OVERLAPS #-} FromJSON Row where
+{-
+  parseJSON o@(Object v) = do
+    paragraphMeta@(paragraphRecord :@: _) <- v .: "meta"
+    --classified <- v .: "accept" >>= parseClassification
+    classified <- v .: "label"
+    pure $ either (\_ -> Unclassified $ debug paragraphRecord) (Full . (paragraphMeta :@:)) classified
+    where
+      parseClassification = withArray "Classification" singleValue
+      singleValue a
+        | not $ Prelude.null a =
+            withText "domain" (pure . Right . Classification) (Vector.head a)
+      singleValue _ = pure $ Left
+        ("Looks like " ++ BS.unpack (encode o) ++ " was not classified, ignoring for now")
+      debug record =
+        "Looks like " ++ uid record ++ " was not classified, ignoring for now"
+
   parseJSON o@(Object v) = do
     paragraphMeta <- v .: "meta" >>= parseJSON
     classified <- v .: "accept" >>= parseClassification
@@ -24,10 +45,47 @@ instance {-# OVERLAPS #-} FromJSON Row where
       singleValue _ = pure $ Left
        ("Looks like " ++ debug ++ " was not classified, ignoring for now")
      debug = BS.unpack $ encode o
+-}
+
+  parseJSON = withObject "Row" parseRow
+    where
+      parseRow o = do
+        paragraphMeta <- o .: "meta"
+        getRow paragraphMeta
+          <$> runExceptT (classification o)
+      getRow paragraphMeta@(paragraphRecord :@: _) = either
+        (Unclassified . debug paragraphRecord)
+        (Full . (paragraphMeta :@:) . Classification)
+      classification :: Object -> ExceptT String Parser Text
+      classification o = do
+        getTextField "answer" o >>= isAccept
+        getTextField "label" o
+      --o .: "label" >>= withText "label" pure
+      --checkAnswer o = ExceptT
+      --  ((o .: "answer" >>= withText "answer" (pure . isAccept))
+      --   <|> pure (Left "answer field is missing"))
+      isAccept "accept" = pure ()
+      isAccept s = throwError $ printf "answer was \"%s\" and not \"accept\"" s
+      --isAccept s = Left $ printf "answer was \"%s\" and not \"accept\"" s
+      debug record = printf "Ignoring %s: %s" (uid record)
+
+getTextField :: String -> Object -> ExceptT String Parser Text
+getTextField name o = getField >>= ensureIsText
+  where
+    getField :: ExceptT String Parser Value
+    getField = ExceptT $
+      (Right <$> (o .: fromString name)) <|> catch "is missing"
+    ensureIsText :: Value -> ExceptT String Parser Text
+    ensureIsText v = ExceptT $
+      withText name (pure . Right) v <|> catch "is not text"
+    catch :: String -> Parser (Either String a)
+    catch = pure . Left . printf "%s field %s" name
+{-
 
   parseJSON invalid =
-    prependFailure "parsing ClassifiedParagraph failed, "
+    prependFailure "parsing Row failed, "
       (typeMismatch "Object" invalid)
+-}
 
 logIgnored :: [Row] -> IO [ClassifiedParagraph]
 logIgnored = foldr keepFull (pure [])
diff --git a/scripts/ML/prodigy-tsv-to-jsonl.hs b/scripts/ML/prodigy-tsv-to-jsonl.hs
index 51cf7a6..8d6e421 100755
--- a/scripts/ML/prodigy-tsv-to-jsonl.hs
+++ b/scripts/ML/prodigy-tsv-to-jsonl.hs
@@ -1,12 +1,15 @@
-#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
 {-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-}
 import Data.Aeson (ToJSON(..), defaultOptions, encode, genericToEncoding)
 import Data.ByteString.Lazy.Char8 as ByteString (putStrLn)
+import Data.Csv (FromNamedRecord(..), ToNamedRecord(..))
 import Data.Text (Text)
 import Data.Text.IO as Text (readFile)
-import GEODE.Metadata (type (@)(..), Record(..), readNamedTsv)
-import GEODE.Metadata.ProdigyMeta
-  (Classification(..), ClassifiedParagraph, ParagraphMeta)
+import GEODE.Metadata
+  ( type (@)(..), DefaultFields(..), HasDefaultHeader(..), Record(..)
+  , readNamedTsv )
+import GEODE.Metadata.ProdigyMeta (ParagraphMeta)
+-- (Classification(..), ClassifiedParagraph, ParagraphMeta)
 import GHC.Generics (Generic)
 import System.Environment (getArgs)
 import System.FilePath ((</>))
@@ -15,21 +18,51 @@ import System.Script (syntax, try)
 data Paragraph = Paragraph
   { text :: Text
   , meta :: ParagraphMeta
-  , accept :: [Text] } deriving Generic
+  --, label :: Text
+  } deriving Generic
+
+newtype DatasetContent = DatasetContent { content :: Text } deriving Generic
+
+instance ToNamedRecord DatasetContent
+instance FromNamedRecord DatasetContent
+instance HasDefaultHeader DatasetContent where
+  defaultFields = DefaultFields ["content"]
+
+type DatasetRow = ParagraphMeta @ DatasetContent
+
+{-
+data Paragraph = Paragraph
+  { text :: Text
+  , meta :: ParagraphMeta
+  , accept :: [Text]
+  } deriving Generic
+-}
 
 instance ToJSON Paragraph where
   toEncoding = genericToEncoding defaultOptions
 
-loadParagraph :: FilePath -> ClassifiedParagraph -> IO Paragraph
-loadParagraph source (meta@(paragraphRecord :@: _) :@: classification) = do
+--loadParagraph :: FilePath -> ClassifiedParagraph -> IO Paragraph
+fromFile :: FilePath -> ParagraphMeta -> IO Paragraph
+--loadParagraph source (meta@(paragraphRecord :@: _) :@: classification) = do
+fromFile source meta@(paragraphRecord :@: _) = do
   text <- Text.readFile (source </> relativePath paragraphRecord "txt")
-  pure $ Paragraph {text, meta, accept = [paragraphFunction classification]}
+  --pure $ Paragraph {text, meta, accept = []}
+  --pure $ Paragraph {text, meta, accept = [paragraphFunction classification], answer = "accept"}
+  --pure $ Paragraph {text, meta, label = paragraphFunction classification}
+  pure $ Paragraph {text, meta}
+
+fromRow :: DatasetRow -> IO Paragraph
+fromRow (meta :@: (DatasetContent {content})) =
+  pure $ Paragraph {text = content, meta}
 
 main :: IO ()
 main = getArgs >>= run
   where
-    run [inputMeta, source] =
-      try (readNamedTsv inputMeta) >>= mapM_ (printJSON source)
-    run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY"
-    printJSON source parMeta =
-      loadParagraph source parMeta >>= ByteString.putStrLn . encode
+    run [dataset] = f fromRow dataset
+    run [inputMeta, source] = f (fromFile source) inputMeta
+--      try (readNamedTsv inputMeta) >>= mapM_ (printJSON source)
+    run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY | TSV_DATASET"
+    f loader input =
+      try (readNamedTsv input) >>= mapM_ (\row -> loader row >>= ByteString.putStrLn . encode)
+    --printJSON source parMeta =
+    --  loadParagraph source parMeta >>= ByteString.putStrLn . encode
diff --git a/scripts/ML/prodigyAcceptedJSONLToTSV.py b/scripts/ML/prodigyAcceptedJSONLToTSV.py
index 2395caa..d0fd491 100755
--- a/scripts/ML/prodigyAcceptedJSONLToTSV.py
+++ b/scripts/ML/prodigyAcceptedJSONLToTSV.py
@@ -1,11 +1,11 @@
-#!/usr/bin/env python3
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
 
-from GEODE import toKey
+from GEODE import toKey, toTSV
 import pandas
 import JSONL
 import sys
 
-def tsv_row(annotation):
+def toRow(annotation):
     return {'work': annotation['meta']['work'],
             'volume': annotation['meta']['volume'],
             'article': annotation['meta']['article'],
@@ -15,9 +15,9 @@
             'paragraph': annotation['meta']['paragraph'],
             'text': annotation['text'],
             'paragraphFunction': annotation['label']
            }
 
 def acceptedToTSV(inputJSONL, outputTSV):
-    annotations = pandas.DataFrame(
-        sorted([tsv_row(a) for a in inputJSONL], key=toKey))
-    annotations.to_csv(outputTSV, sep='\t', index=False)
+    toTSV(outputTSV, [toRow(a) for a in inputJSONL if a['answer'] == 'accept'])
+    #toTSV(outputTSV, [toRow(a) for a in inputJSONL if a['answer'] == 'accept'],
+    #      sortBy=None)
 
 if __name__ == '__main__':
     acceptedToTSV(JSONL.load(sys.argv[1]), sys.argv[2])
diff --git a/scripts/ML/prodigyMultiJSONLToDirectory.py b/scripts/ML/prodigyMultiJSONLToDirectory.py
index cf3ccae..0c1227e 100755
--- a/scripts/ML/prodigyMultiJSONLToDirectory.py
+++ b/scripts/ML/prodigyMultiJSONLToDirectory.py
@@ -1,16 +1,12 @@
-#!/usr/bin/env python3
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
 
-from Corpus import Directory
-from GEODE import toKey, uid
-import GEODE.discursive as discursive
-from GEODE.util import initialise
-import pandas
+from GEODE import Directory, discursiveFunctions, toKey, toTSV, uid
+from split import initialise, toIterator
+from split.Error import Contradiction, NoLabelLeft, TwoAnnotations, UnknownAnswer
+from unbalanceLimiter import unbalanceLimiter
 import JSONL
 import sys
 
-def subDict(d, keys):
-    return {key: d[key] for key in keys}
-
 def initialiseTexts(texts, key, annotation):
     initialise(texts,
                key,
@@ -26,7 +22,7 @@
         answer = annotation['answer']
         initialise(labels, label, {'accept': [], 'reject': [], 'ignore': []})
         if answer not in labels[label]:
-            print(f"Unsupported answer '{answer}' for annotation {annotation}")
+            UnknownAnswer(annotation, answer)
         else:
             labels[label][answer].append(annotation)
     return labels
@@ -34,7 +30,6 @@
-def erase(texts, error, key, reason):
+def erase(texts, error, key):
     error[key] = texts[key]['row']
     del texts[key]
-    print(reason)
 
 def accept(texts, errors, label, accepted):
     for annotation in accepted:
@@ -43,8 +38,8 @@
         initialiseTexts(texts, key, annotation)
         previous = texts[key]['accept']
         if previous is not None:
-            reason = f"Found two annotations for {uid(annotation['meta'])}: '{label}' and '{previous}'"
-            erase(texts, errors, key, reason)
+            TwoAnnotations(annotation, previous, label)
+            erase(texts, errors, key)
         else:
             texts[key]['accept'] = label
 
@@ -55,19 +50,21 @@
         initialiseTexts(texts, key, annotation)
         previous = texts[key]['accept']
         if previous is not None and previous == label:
-            erase(texts, errors, key, f"Contradiction found for {uid(annotation['meta'])}: function {label} should be both accepted and rejected")
+            Contradiction(annotation, label)
+            erase(texts, errors, key)
        else:
            texts[key]['reject'].add(label)
 
 def checkRejects(texts, errors):
     for key, text in texts.items():
         countRejected = len(text['reject'])
-        countFunctions = len(discursive.functions)
+        countFunctions = len(discursiveFunctions)
         if countRejected == countFunctions:
-            reason = f"No possible function left for {uid(text['row'])}"
f"No possible function left for {uid(text['row'])}" - erase(texts, errors, key, reason) + NoLabelLeft(text['row']) + erase(texts, errors, key) elif text['accept'] is None and countRejected == countFunctions - 1: - text['accept'] = discursive.functions.difference(text['reject']).pop() + free = set(discursiveFunctions).difference(text['reject']) + text['accept'] = free.pop() print(f"Infered {uid(text['row'])} to be {text['accept']}, only discursive function left unrejected") def byText(byLabelAnnotations): @@ -79,19 +76,9 @@ def byText(byLabelAnnotations): checkRejects(texts, errors) return texts.values(), errors.values() -def toTsv(filePath, data): - rows = sorted(data, key=toKey) - pandas.DataFrame(rows).to_csv(filePath, sep='\t', index=False) - -def toIterator(*args): - for arg in args: - for elem in arg: - yield elem - def exportCorpus(rootDirectory, texts, errors): corpus = Directory(rootDirectory) - corpus.save(sorted(toIterator([t['row'] for t in texts], errors), - key=toKey)) + corpus.save(toIterator([t['row'] for t in texts], errors)) def indexByKey(annotations): return {toKey(annotation['meta']): annotation for annotation in annotations} @@ -110,16 +97,19 @@ def toRow(answer): def exportLabels(rootDirectory, labels): for label, answers in labels.items(): - toTsv(f"{rootDirectory}/{label}.tsv", - toIterator(map(toRow('accept'), answers['accept']), - map(toRow('reject'), allRejects(labels, label)))) + toTSV(f"{rootDirectory}/{label}.tsv", + unbalanceLimiter( + toIterator(map(toRow('accept'), answers['accept']), + map(toRow('reject'), allRejects(labels, label))), + maxRatio=4, + attribute='answer')) def multiJSONLToDirectory(jsonl, outputDirectory): byLabelAnnotations = byLabel(jsonl) texts, errors = byText(byLabelAnnotations) exportCorpus(outputDirectory, texts, errors) if len(errors) > 0: - toTsv(f"{outputDirectory}/errors.tsv", errors) + toTSV(f"{outputDirectory}/errors.tsv", errors) exportLabels(outputDirectory, byLabelAnnotations) if __name__ == '__main__': diff --git a/scripts/ML/simpleTrainOfMulti.py b/scripts/ML/simpleTrainOfMulti.py deleted file mode 100755 index 5f80001..0000000 --- a/scripts/ML/simpleTrainOfMulti.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 - -from Corpus import Directory, SelfContained -from GEODE import fromKey, toKey -import GEODE.discursive as discursive -from prodigyAcceptedJSONLToTSV import acceptedToTSV -from sys import argv - -def isAccepted(key, row): - return row['answer'] == 'accept' - -def withLabel(corpus, label): - return lambda key, row: dict(**corpus.full(key, row) - , paragraphFunction=label) - -def simpleTrainOfMulti(multiDirectory, outputTSV): - annotations = [] - for className in discursive.functions: - corpus = Directory(multiDirectory, tsv_filename=className) - p = withLabel(corpus, className) - annotations += list(corpus.get_all(projector=p, where=isAccepted)) - output = SelfContained(outputTSV) - output.save(sorted(annotations, key=toKey)) - -if __name__ == '__main__': - simpleTrainOfMulti(argv[1], argv[2]) diff --git a/scripts/ML/splitMulti.py b/scripts/ML/splitMulti.py index d5cd2a7..9de1423 100755 --- a/scripts/ML/splitMulti.py +++ b/scripts/ML/splitMulti.py @@ -1,13 +1,11 @@ -#!/usr/bin/env python3 -from Corpus import Directory +#!/usr/bin/env -S PYTHONPATH=lib/python python3 + from GEODE import toKey -from GEODE.Error import TwoAnnotations -from GEODE.util import initialise, parseRatio +from split import initialise, parseRatio +from split.Error import TwoAnnotations import JSONL from random import shuffle from sys 
-from prodigyAcceptedJSONLToTSV import acceptedToTSV
-from prodigyMultiJSONLToDirectory import multiJSONLToDirectory
 
 def getTexts(inputJSONL):
     texts = {}
@@ -16,19 +14,20 @@
         key = toKey(annotation['meta'])
         if key not in errors:
             initialise(texts, key, {'accept': None, 'reject': []})
-            if annotation['answer'] == 'accept':
-                previous = texts[key]['accept']
-                if previous is None:
-                    texts[key]['accept'] = annotation
-                else:
-                    print(TwoAnnotations(annotations['meta'],
-                                         previous['label'],
-                                         texts[key]['label']))
-                    errors.add(key)
-            else:
-                texts[key]['reject'].append(annotation)
+            sortByAnswer(texts, errors, key, annotation)
     return texts
 
+def sortByAnswer(texts, errors, key, annotation):
+    if annotation['answer'] == 'accept':
+        previous = texts[key]['accept']
+        if previous is None:
+            texts[key]['accept'] = annotation
+        else:
+            TwoAnnotations(annotation, previous['label'], annotation['label'])
+            errors.add(key)
+    else:
+        texts[key]['reject'].append(annotation)
+
 def getTest(texts, trainRatio):
     accepted = [key for key, t in texts.items() if t['accept'] is not None]
     shuffle(accepted)
@@ -42,16 +41,22 @@ def allAnnotations(text):
     return [text['accept']] + text['reject']
 
 def getTrain(texts, test):
-    return [annotation
-            for key in sorted(texts.keys()) if key not in test
-            for annotation in allAnnotations(texts[key])]
+    train = []
+    waste = []
+    for key in sorted(texts.keys()):
+        if key not in test:
+            train += allAnnotations(texts[key])
+        else:
+            waste += texts[key]['reject']
+    return train, waste
 
 def splitMulti(jsonl, trainRatio, trainOutput, testOutput):
     texts = getTexts(jsonl)
     test = getTest(texts, trainRatio)
-    train = getTrain(texts, test)
-    multiJSONLToDirectory(train, trainOutput)
-    acceptedToTSV(test.values(), testOutput)
+    train, waste = getTrain(texts, test)
+    print(f"{len(waste)} negative annotations about texts in the test set have been discarded from the training one")
+    JSONL.save(trainOutput, train)
+    JSONL.save(testOutput, test.values())
 
 if __name__ == '__main__':
     splitMulti(JSONL.load(stdin), parseRatio(argv[1]), argv[2], argv[3])
diff --git a/scripts/ML/train.py b/scripts/ML/train.py
deleted file mode 100755
index 95b812f..0000000
--- a/scripts/ML/train.py
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env python3
-from BERT import Trainer
-from LabeledData import LabeledData
-import sys
-
-if __name__ == '__main__':
-    labeled_data = LabeledData(sys.argv[1])
-    trainer = Trainer(sys.argv[2], labeled_data)
-    trainer()
diff --git a/scripts/ML/trainMultiBERT.py b/scripts/ML/trainMultiBERT.py
index 43cc10b..f8eae39 100755
--- a/scripts/ML/trainMultiBERT.py
+++ b/scripts/ML/trainMultiBERT.py
@@ -1,8 +1,7 @@
-#!/usr/bin/env python3
-from BERT import BERT, Trainer
-from Corpus import Directory
-import GEODE.discursive as discursive
-from LabeledData import LabeledData
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+
+from BERT import BERT, LabeledData, Trainer
+from GEODE import Directory, discursiveFunctions
 import os
 import sys
 
@@ -20,5 +19,5 @@ def trainSubClassifier(trainRoot, modelRoot, className):
     trainer()
 
 if __name__ == '__main__':
-    for className in discursive.functions:
+    for className in discursiveFunctions:
         trainSubClassifier(sys.argv[1], sys.argv[2], className)
diff --git a/scripts/ML/trainSimpleBERT.py b/scripts/ML/trainSimpleBERT.py
index d869b4a..28107d3 100755
--- a/scripts/ML/trainSimpleBERT.py
+++ b/scripts/ML/trainSimpleBERT.py
@@ -1,10 +1,13 @@
-#!/usr/bin/env python3
-from Corpus import corpus
-from BERT import Trainer
-from LabeledData import LabeledData
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+
+from BERT import Trainer, LabeledData
+from GEODE import corpus
+import os
 import sys
 
 if __name__ == '__main__':
     labeled_data = LabeledData(corpus(sys.argv[1]), "paragraphFunction")
-    trainer = Trainer(sys.argv[2], labeled_data)
+    modelPath = sys.argv[2]
+    os.makedirs(modelPath, exist_ok=True)
+    trainer = Trainer(modelPath, labeled_data)
     trainer()
diff --git a/scripts/extract-from-source.sh b/scripts/extract-from-source.sh
index ba74ce0..f89f885 100755
--- a/scripts/extract-from-source.sh
+++ b/scripts/extract-from-source.sh
@@ -1,8 +1,8 @@
 #!/bin/sh
 
-BASE_DIR="${0%/*}"
+BASE_DIR="${0%%/*}"
 
-source ${BASE_DIR}/lib.sh
+source ${BASE_DIR}/lib/bash.sh
 
 if [ "$#" != 2 ]
 then
@@ -16,5 +16,5 @@ fi
 FILES_TSV="${TARGET}/files.tsv"
 
 printf "book tome rank headWord name page\n" > "${FILES_TSV}"
-${BASE_DIR}/EDdA/extract-from-source.sh "${SOURCE}/EDdA/ARTFL" ${TARGET} >> "${FILES_TSV}"
-${BASE_DIR}/LGE/extract-from-source.sh "${SOURCE}/LGE/BnF" ${TARGET} >> "${FILES_TSV}"
+${BASE_DIR}/scripts/EDdA/extract-from-source.sh "${SOURCE}/EDdA/ARTFL" ${TARGET} >> "${FILES_TSV}"
+${BASE_DIR}/scripts/LGE/extract-from-source.sh "${SOURCE}/LGE/BnF" ${TARGET} >> "${FILES_TSV}"
-- 
GitLab