diff --git a/scripts/ML/BERT/Base.py b/lib/python/BERT/Base.py similarity index 100% rename from scripts/ML/BERT/Base.py rename to lib/python/BERT/Base.py diff --git a/scripts/ML/BERT/Classifier.py b/lib/python/BERT/Classifier.py similarity index 100% rename from scripts/ML/BERT/Classifier.py rename to lib/python/BERT/Classifier.py diff --git a/scripts/ML/LabeledData.py b/lib/python/BERT/LabeledData.py similarity index 100% rename from scripts/ML/LabeledData.py rename to lib/python/BERT/LabeledData.py diff --git a/scripts/ML/BERT/Trainer.py b/lib/python/BERT/Trainer.py similarity index 88% rename from scripts/ML/BERT/Trainer.py rename to lib/python/BERT/Trainer.py index 70c2a8f4b75b068193e12ed343ae6e7e4cc8b580..b094d141ad9a403ad70b2acde3567ed1e5a413ab 100644 --- a/scripts/ML/BERT/Trainer.py +++ b/lib/python/BERT/Trainer.py @@ -1,6 +1,7 @@ from BERT.Base import BERT import datetime -from loaders import set_random +import numpy +import random import time import torch from torch.optim import AdamW @@ -31,9 +32,13 @@ class Trainer(BERT): num_warmup_steps = 0, # Default value in run_glue.py num_training_steps = self.epochs * len(data_loader)) - def __call__(self): - set_random() - losses = [self.epoch(e) for e in range(self.epochs)] + def __call__(self, seed_value=42): + random.seed(seed_value) + numpy.random.seed(seed_value) + torch.manual_seed(seed_value) + torch.cuda.manual_seed_all(seed_value) + for e in range(self.epochs): + self.epoch(e) self.save() print("\nTraining complete!") diff --git a/scripts/ML/BERT/__init__.py b/lib/python/BERT/__init__.py similarity index 70% rename from scripts/ML/BERT/__init__.py rename to lib/python/BERT/__init__.py index 50cbcc17fd40517cb519e00e8e1c03a857e2d5b4..ce7c99af4e07983a777df066d071a4b689b65d93 100644 --- a/scripts/ML/BERT/__init__.py +++ b/lib/python/BERT/__init__.py @@ -1,3 +1,4 @@ from BERT.Base import BERT from BERT.Classifier import Classifier +from BERT.LabeledData import LabeledData from BERT.Trainer import Trainer diff --git a/lib/python/GEODE/Classification/__init__.py b/lib/python/GEODE/Classification/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..695af95093383c9c9b5c616f5ad0c1bc2fb2229c --- /dev/null +++ b/lib/python/GEODE/Classification/__init__.py @@ -0,0 +1,19 @@ +from GEODE.Classification.discursive import functions as discursiveFunctions + +knowledgeDomains = [ 'Agriculture', + 'Beaux-arts', + 'Belles-lettres', + 'Chasse', + 'Commerce', + 'Droit Jurisprudence', + 'Géographie', + 'Histoire', + 'Histoire naturelle', + 'Médecine', + 'Métiers', + 'Militaire', + 'Musique', + 'Philosophie', + 'Physique', + 'Politique', + 'Religion' ] diff --git a/scripts/ML/GEODE/discursive.py b/lib/python/GEODE/Classification/discursive.py similarity index 73% rename from scripts/ML/GEODE/discursive.py rename to lib/python/GEODE/Classification/discursive.py index 60a958e3d9a76079209cdb9297a2da6a13ff3aed..31619849a31ca8ac06591b214bf64f0517cb18b9 100644 --- a/scripts/ML/GEODE/discursive.py +++ b/lib/python/GEODE/Classification/discursive.py @@ -1,4 +1,4 @@ -functions = {'Historical narrative', +functions = ['Historical narrative', 'People narrative', 'Critical', 'Description', @@ -6,4 +6,4 @@ functions = {'Historical narrative', 'Example', 'Reasoning', 'Quotation', - 'Prescriptive'} + 'Prescriptive'] diff --git a/lib/python/GEODE/Classification/domains.py b/lib/python/GEODE/Classification/domains.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/ML/GEODE/__init__.py b/lib/python/GEODE/Metadata/__init__.py similarity index 100% rename from scripts/ML/GEODE/__init__.py rename to lib/python/GEODE/Metadata/__init__.py diff --git a/lib/python/GEODE/Visualisation.py b/lib/python/GEODE/Visualisation.py new file mode 100644 index 0000000000000000000000000000000000000000..c60b3ab80243c606a7ca0dfba2b3509e5cc17d87 --- /dev/null +++ b/lib/python/GEODE/Visualisation.py @@ -0,0 +1,12 @@ +from GEODE.store import prepare +import matplotlib.pyplot as plot +import seaborn + +def heatmap(matrix, filePath, labels, **kwargs): + plot.figure(figsize=(16,13)) + if 'cmap' not in kwargs: + kwargs['cmap'] = 'Blues' + ax = seaborn.heatmap( + matrix, xticklabels=labels, yticklabels=labels, **kwargs + ) + plot.savefig(prepare(filePath), dpi=300, bbox_inches='tight') diff --git a/lib/python/GEODE/__init__.py b/lib/python/GEODE/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fd7e6c6d1913a05ab2d06fca4d185ca3fcad4264 --- /dev/null +++ b/lib/python/GEODE/__init__.py @@ -0,0 +1,4 @@ +from GEODE.Classification import discursiveFunctions +from GEODE.Metadata import article, paragraph, relativePath, toKey, uid +from GEODE.store import corpus, Directory, SelfContained, toTSV +from GEODE.Visualisation import heatmap diff --git a/lib/python/GEODE/signal.py b/lib/python/GEODE/signal.py new file mode 100644 index 0000000000000000000000000000000000000000..1e2fa18db9f3eaa8bc61ca6aacf348c531469388 --- /dev/null +++ b/lib/python/GEODE/signal.py @@ -0,0 +1,24 @@ +import math + +def curry(f): + return lambda x: (lambda *args: f(x, *args)) + +def gate(n, size, offset=0): + return [1 if i == n else 0 for i in range(offset, offset+size)] + +@curry +def orientedIntersection(l, sNew, sOld): + left = max(sNew*l[0], sOld*l[1]) + right = min((sNew+1)*l[0], (sOld+1)*l[1]) + return max(right-left, 0) + +@curry +def resample(newSize, distribution): + oldSize = len(distribution) + lcm = math.lcm(newSize, oldSize) + intersection = orientedIntersection((lcm/newSize, lcm/oldSize)) + ratio = oldSize / newSize + for i in range(newSize): + yield oldSize/lcm*sum([distribution[j]*intersection(i, j) + for j in range(math.floor(i*ratio), + math.ceil((i+1)*ratio))]) diff --git a/scripts/ML/Corpus.py b/lib/python/GEODE/store/Corpus.py similarity index 94% rename from scripts/ML/Corpus.py rename to lib/python/GEODE/store/Corpus.py index 5abf5b3a96754eb917495539d4de5862edfbb92e..e72b8b60fb9a8cd5f18caa54d817c170c295e2e5 100644 --- a/scripts/ML/Corpus.py +++ b/lib/python/GEODE/store/Corpus.py @@ -1,4 +1,5 @@ -from GEODE import fromKey, relativePath +from GEODE.Metadata import fromKey, relativePath +from GEODE.store.TSV import toTSV import pandas from os import makedirs from os.path import dirname, isdir @@ -54,7 +55,7 @@ class TSVIndexed(Corpus): def full(self, key, row): d = self.key(key, row) - d[self.column_name] = self.content(key, row).strip() + '\n' + d[self.column_name] = self.content(key, row).strip() return d def get_all(self, projector=None, where=None): @@ -98,7 +99,7 @@ class SelfContained(TSVIndexed): def save(self, iterator): self.data = pandas.DataFrame(iterator) self.detect_keys() - self.data.to_csv(self.tsv_path, sep='\t', index=False) + toTSV(self.tsv_path, self.data) class Directory(TSVIndexed): """ @@ -144,7 +145,7 @@ class Directory(TSVIndexed): self.detect_keys() for _, row in self.data.iterrows(): self.write_text(row, row[self.column_name]) - self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False) + toTSV(self.tsv_path, self.data[self.keys]) def corpus(path, **kwargs): if path[-1:] == '/' or isdir(path): diff --git a/lib/python/GEODE/store/TSV.py b/lib/python/GEODE/store/TSV.py new file mode 100644 index 0000000000000000000000000000000000000000..3967be3a13e1f658c26c39e972c843084f0fdac0 --- /dev/null +++ b/lib/python/GEODE/store/TSV.py @@ -0,0 +1,24 @@ +from GEODE.signal import curry +from numpy import vectorize +import pandas + +@curry +def toStrKey(areParagraphs, row): + key = "{work}_{volume:02d}_{article:04d}" + if areParagraphs: + key += "_{paragraph:04d}" + return key.format(**row) + +def forPanda(data, f): + return vectorize(lambda i: f(data.iloc[i])) + +def toTSV(filePath, data, sortBy='toStrKey'): + if type(data) != pandas.DataFrame: + data = pandas.DataFrame(data) + if sortBy == 'toStrKey': + sortBy = toStrKey('paragraph' in data) + if sortBy is None: + sortedData = data + else: + sortedData = data.sort_index(key=forPanda(data, sortBy)) + sortedData.to_csv(filePath, sep='\t', index=False) diff --git a/lib/python/GEODE/store/__init__.py b/lib/python/GEODE/store/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ebbd38d9b3dac61626996636728c1089c8c5938a --- /dev/null +++ b/lib/python/GEODE/store/__init__.py @@ -0,0 +1,9 @@ +from GEODE.store.Corpus import corpus, Directory, SelfContained +from GEODE.store.TSV import toTSV +import os +import os.path + +def prepare(path): + if '/' in path: + os.makedirs(os.path.dirname(path), exist_ok=True) + return path diff --git a/scripts/ML/JSONL.py b/lib/python/JSONL.py similarity index 57% rename from scripts/ML/JSONL.py rename to lib/python/JSONL.py index 07b2aafb8075e0f12a1332b9bebaf1b9abcefbcb..4e05d05b12d1dad71ed3575740bd287b546eb14d 100644 --- a/scripts/ML/JSONL.py +++ b/lib/python/JSONL.py @@ -1,4 +1,5 @@ import json +import sys def load(file_path): if type(file_path) == str: @@ -9,7 +10,22 @@ def load(file_path): for line in file_path.readlines(): yield json.loads(line) +""" +def load(file_path): + if type(file_path) == str: + with open(file_path, 'r') as input_file: + return list(loadObjects(input_file)) + else: + return loadObjects(file_path) + +def loadObjects(input_file): + for line in input_file.readlines(): + yield json.loads(line) +""" + def save(file_path, objects): + if file_path == '-': + file_path = sys.stdin if type(file_path) == str: with open(file_path, 'w') as output_file: saveObjects(output_file, objects) @@ -18,5 +34,5 @@ def save(file_path, objects): def saveObjects(output_file, objects): for obj in objects: - json.dump(obj, output_file) + json.dump(obj, output_file, separators=(',', ':')) print(file=output_file) diff --git a/lib/python/split/Error.py b/lib/python/split/Error.py new file mode 100644 index 0000000000000000000000000000000000000000..867e5b81046483724c4cca87ae05866291fd69f0 --- /dev/null +++ b/lib/python/split/Error.py @@ -0,0 +1,18 @@ +from GEODE import uid + +def getUID(annotation): + return uid(annotation['meta']) + +def UnknownAnswer(annotation, answer): + print(f"Unsupported answer '{answer}' for annotation {getUID(annotation)}") + +def TwoAnnotations(annotation, first, second): + print(f"Found two annotations for {getUID(annotation)}: " + + f"'{first}' and '{second}'") + +def Contradiction(annotation, label): + print(f"Contradiction found for {getUID(annotation)}: " + + f"function {label} should be both accepted and rejected") + +def NoLabelLeft(text): + print(f"No possible function left for {uid(text)}") diff --git a/scripts/ML/GEODE/util.py b/lib/python/split/__init__.py similarity index 79% rename from scripts/ML/GEODE/util.py rename to lib/python/split/__init__.py index e9945b1f7f8bb76a860b8a96369d315b1b23db15..964351201ad49fc13977f2286dde6b5d238cac3c 100644 --- a/scripts/ML/GEODE/util.py +++ b/lib/python/split/__init__.py @@ -12,3 +12,7 @@ def checkBound(f): def parseRatio(s): return checkBound(int(s[:-1]) / 100 if s[-1] == '%' else float(s)) +def toIterator(*args): + for arg in args: + for elem in arg: + yield elem diff --git a/manifest.scm b/manifest.scm index 1234356fd45d5299b68bf303fb95718f234bdf34..4627992d0b81e0f0e4a03217e521f36b4b2cdde7 100644 --- a/manifest.scm +++ b/manifest.scm @@ -9,9 +9,10 @@ ghc-hs-conllu ghc-random ghc-regex-tdfa)) + ((gnu packages machine-learning) #:select (python-scikit-learn python-spacy)) ((gnu packages python) #:select (python)) ((gnu packages python-science) #:select (python-pandas)) - ((gnu packages python-xyz) #:select (python-beautifulsoup4)) + ((gnu packages python-xyz) #:select (python-beautifulsoup4 python-seaborn)) ((gnu packages xml) #:select (python-lxml))) ;(define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm")) @@ -40,6 +41,9 @@ ;python-edda ; TODO python-lxml ; fusion articles into tomes for TXM python-pandas ; working with CSV in python + python-scikit-learn ; evaluating models + python-seaborn ; draw figures + python-spacy ; working with prodigy's custom formats python-stanza ; annotation sed ; select files from listing stanza-fr ; annotation diff --git a/scripts/LGE/extract-from-source.sh b/scripts/LGE/extract-from-source.sh index 75a88831e756b340594bc5d524ba23b9af67e937..931e2507ff3088fbd4e41cc73687afc20f50f1d9 100755 --- a/scripts/LGE/extract-from-source.sh +++ b/scripts/LGE/extract-from-source.sh @@ -1,6 +1,6 @@ #!/bin/sh -source ${0%/*}/../lib.sh +source ${0%%/*}/lib/bash.sh if [ "$#" != 2 ] then diff --git a/scripts/ML/GEODE/Error.py b/scripts/ML/GEODE/Error.py deleted file mode 100644 index 09cd467395fb772ed3fecd9b73f4c65613af8a81..0000000000000000000000000000000000000000 --- a/scripts/ML/GEODE/Error.py +++ /dev/null @@ -1,5 +0,0 @@ -from GEODE import uid - -def TwoAnnotations(text, first, second): - textUID = text if type(text) == str else uid(text) - return f"Found two annotations for {textUID}: '{first}' and '{second}'" diff --git a/scripts/ML/convert-corpus.py b/scripts/ML/convert-corpus.py index a37fb2c0e623b07542f75a8127aed6cbe43352d9..6b2b7039f1904f2b636637a25a01f310929ebe81 100755 --- a/scripts/ML/convert-corpus.py +++ b/scripts/ML/convert-corpus.py @@ -1,5 +1,5 @@ -#!/usr/bin/env python3 -from Corpus import corpus +#!/usr/bin/env -S PYTHONPATH=lib/python python3 +from GEODE import corpus import sys if __name__ == '__main__': diff --git a/scripts/ML/evaluate.py b/scripts/ML/evaluate.py index 104cdad5d97c4baff1b1629a7ac3a253c797e073..27d05f01f9a8899387dc747fb7309060f70dc492 100755 --- a/scripts/ML/evaluate.py +++ b/scripts/ML/evaluate.py @@ -1,7 +1,7 @@ -#!/usr/bin/env python3 -from EDdA.classification import heatmap -from EDdA.store import preparePath -import GEODE.discursive as discursive +#!/usr/bin/env -S PYTHONPATH=lib/python python3 + +from GEODE.Visualisation import heatmap +from GEODE.Classification import discursiveFunctions, knowledgeDomains import pandas from sklearn.metrics import classification_report, confusion_matrix from sys import argv @@ -9,11 +9,13 @@ from sys import argv def evaluate(truth, predictions, outputDirectory): matrix = confusion_matrix(truth, predictions, - labels=list(discursive.functions), + labels=knowledgeDomains, + #labels=discursiveFunctions normalize='true') heatmap(matrix, - preparePath(f"{outputDirectory}/confusion.png"), - labels=discursive.functions) + f"{outputDirectory}/confusion.png", + labels=knowledgeDomains) + #labels=discursiveFunctions) with open(f"{outputDirectory}/report.json", 'w') as json: print(classification_report(truth, predictions, output_dict=True), file=json) @@ -24,4 +26,4 @@ def evaluate(truth, predictions, outputDirectory): if __name__ == '__main__': truth = pandas.read_csv(argv[1], sep='\t') predictions = pandas.read_csv(argv[2], sep='\t') - evaluate(truth['paragraphFunction'], predictions['label'], argv[3]) + evaluate(truth['super_domain'], predictions['label'], argv[3]) diff --git a/scripts/ML/loaders.py b/scripts/ML/loaders.py deleted file mode 100644 index 93986f4f4d260f7c92473a5cd5909547da690e4f..0000000000000000000000000000000000000000 --- a/scripts/ML/loaders.py +++ /dev/null @@ -1,10 +0,0 @@ -import numpy -import random -import torch - -def set_random(): - seed_value = 42 - random.seed(seed_value) - numpy.random.seed(seed_value) - torch.manual_seed(seed_value) - torch.cuda.manual_seed_all(seed_value) diff --git a/scripts/ML/predictMulti.py b/scripts/ML/predictMulti.py index 80d7eaa76b32baf8867cf6bdf64b87c261c7a659..93ab6ab2ac3d390d4f1cfa1b1cd938ef66e6ec39 100755 --- a/scripts/ML/predictMulti.py +++ b/scripts/ML/predictMulti.py @@ -1,7 +1,7 @@ -#!/usr/bin/env python3 +#!/usr/bin/env -S PYTHONPATH=lib/python python3 + from BERT import Classifier -from Corpus import corpus -import GEODE.discursive as discursive +from GEODE import corpus, discursiveFunctions, toTSV import pandas from sys import argv @@ -10,12 +10,12 @@ def rateClass(name, answer, score): def combine(row): classes = [(name, row[name], row[name + 'Score']) - for name in discursive.functions] + for name in discursiveFunctions] return max(classes, key=lambda c: rateClass(*c))[0] def label(modelsRoot, source): records = pandas.DataFrame(source.get_all('key')) - for name in discursive.functions: + for name in discursiveFunctions: classify = Classifier(f"{modelsRoot}/{name}") content = source.get_all('content') records[name], records[name + 'Score'] = classify(content) @@ -23,4 +23,4 @@ def label(modelsRoot, source): return records if __name__ == '__main__': - label(argv[1], corpus(argv[2])).to_csv(argv[3], sep='\t', index=False) + toTSV(argv[3], label(argv[1], corpus(argv[2]))) diff --git a/scripts/ML/predictSimple.py b/scripts/ML/predictSimple.py index 5ba28ad33027f8cbd7d59a9a1491be906b05b9c6..3f511c8d83505024c7100979814197a60b4bb238 100755 --- a/scripts/ML/predictSimple.py +++ b/scripts/ML/predictSimple.py @@ -1,7 +1,8 @@ -#!/usr/bin/env python3 +#!/usr/bin/env -S PYTHONPATH=lib/python python3 + from BERT import Classifier +from GEODE import corpus, toTSV import pandas -from Corpus import corpus from sys import argv def label(classify, source, name='label'): @@ -26,6 +27,4 @@ def label(classify, source, name='label'): return records if __name__ == '__main__': - classify = Classifier(argv[1]) - source = corpus(argv[2]) - label(classify, source).to_csv(argv[3], sep='\t', index=False) + toTSV(argv[3], label(Classifier(argv[1]), corpus(argv[2]))) diff --git a/scripts/ML/prodigy-jsonl-to-tsv.hs b/scripts/ML/prodigy-jsonl-to-tsv.hs index d26e850f4da9c39dd6e46262436ab56a17ed6db1..c37c505eea29351e8d23dae1a9905214cb62b413 100755 --- a/scripts/ML/prodigy-jsonl-to-tsv.hs +++ b/scripts/ML/prodigy-jsonl-to-tsv.hs @@ -1,17 +1,38 @@ #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances" {-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-} -import Data.Aeson ((.:), FromJSON(..), Value(..), encode, withArray, withText, eitherDecode) -import Data.Aeson.Types (prependFailure, typeMismatch) +import Control.Applicative ((<|>)) +import Control.Monad.Except (MonadError(..), ExceptT(..), runExceptT) +import Data.Aeson ((.:), FromJSON(..), Object, Value(..), encode, withArray, withObject, withText, eitherDecode) +import Data.Aeson.Types (Parser, prependFailure, typeMismatch) import Data.ByteString.Lazy as BS (null, readFile, split) import Data.ByteString.Lazy.Char8 as BS (unpack) +import Data.String (IsString(..)) +import Data.Text as Text (Text) import Data.Vector as Vector (head) -import GEODE.Metadata (type (@)(..), tsvFile) +import GEODE.Metadata (type (@)(..), Record(..), tsvFile) import GEODE.Metadata.ProdigyMeta (Classification(..), ClassifiedParagraph) import System.Environment (getArgs) import System.Script (try, syntax, warn) +import Text.Printf (printf) data Row = Unclassified String | Full ClassifiedParagraph instance {-# OVERLAPS #-} FromJSON Row where +{- + parseJSON o@(Object v) = do + paragraphMeta@(paragraphRecord :@: _) <- v .: "meta" + --classified <- v .: "accept" >>= parseClassification + classified <- v .: "label" + pure $ either (\_ -> Unclassified $ debug paragraphRecord) (Full . (paragraphMeta :@:)) classified + where + parseClassification = withArray "Classification" singleValue + singleValue a + | not $ Prelude.null a = + withText "domain" (pure . Right . Classification) (Vector.head a) + singleValue _ = pure $ Left + ("Looks like " ++ BS.unpack (encode o) ++ " was not classified, ignoring for now") + debug record = + "Looks like " ++ uid record ++ " was not classified, ignoring for now" + parseJSON o@(Object v) = do paragraphMeta <- v .: "meta" >>= parseJSON classified <- v .: "accept" >>= parseClassification @@ -24,10 +45,47 @@ instance {-# OVERLAPS #-} FromJSON Row where singleValue _ = pure $ Left ("Looks like " ++ debug ++ " was not classified, ignoring for now") debug = BS.unpack $ encode o +-} + + parseJSON = withObject "Row" parseRow + where + parseRow o = do + paragraphMeta <- o .: "meta" + getRow paragraphMeta + <$> runExceptT (classification o) + getRow paragraphMeta@(paragraphRecord :@: _) = either + (Unclassified . debug paragraphRecord) + (Full . (paragraphMeta :@:) . Classification) + classification :: Object -> ExceptT String Parser Text + classification o = do + getTextField "answer" o >>= isAccept + getTextField "label" + --o .: "label" >>= withText "label" pure + --checkAnswer o = ExceptT + -- ((o .: "answer" >>= withText "answer" (pure . isAccept)) + -- <|> pure (Left "answer field is missing")) + isAccept "accept" = pure () + isAccept s = throwError $ printf "answer was \"%s\" and not \"accept\"" s + --isAccept s = Left $ printf "answer was \"%s\" and not \"accept\"" s + debug record = printf "Ignoring %s: %s" (uid record) + +getTextField :: String -> Object -> ExceptT String Parser Text +getTextField name o = getField >>= ensureIsText + where + getField :: ExceptT String Parser Value + getField = ExceptT $ + (Right <$> (o .: fromString name)) <|> catch "is missing" + ensureIsText :: Value -> ExceptT String Parser Text + ensureIsText v = ExceptT $ + withText name (pure . Right) v <|> catch "is not text" + catch :: String -> Parser (Either String a) + catch = pure . Left . printf "%s field %s" name +{- parseJSON invalid = - prependFailure "parsing ClassifiedParagraph failed, " + prependFailure "parsing Row failed, " (typeMismatch "Object" invalid) +-} logIgnored :: [Row] -> IO [ClassifiedParagraph] logIgnored = foldr keepFull (pure []) diff --git a/scripts/ML/prodigy-tsv-to-jsonl.hs b/scripts/ML/prodigy-tsv-to-jsonl.hs index 51cf7a6214248cbcb447e5dc2c3059fff2b73968..8d6e4210f9f8098fb03a342e6f3637caf04091ed 100755 --- a/scripts/ML/prodigy-tsv-to-jsonl.hs +++ b/scripts/ML/prodigy-tsv-to-jsonl.hs @@ -1,12 +1,15 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" {-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-} import Data.Aeson (ToJSON(..), defaultOptions, encode, genericToEncoding) import Data.ByteString.Lazy.Char8 as ByteString (putStrLn) +import Data.Csv (FromNamedRecord(..), ToNamedRecord(..)) import Data.Text (Text) import Data.Text.IO as Text (readFile) -import GEODE.Metadata (type (@)(..), Record(..), readNamedTsv) -import GEODE.Metadata.ProdigyMeta - (Classification(..), ClassifiedParagraph, ParagraphMeta) +import GEODE.Metadata + ( type (@)(..), DefaultFields(..), HasDefaultHeader(..), Record(..) + , readNamedTsv ) +import GEODE.Metadata.ProdigyMeta (ParagraphMeta) +-- (Classification(..), ClassifiedParagraph, ParagraphMeta) import GHC.Generics (Generic) import System.Environment (getArgs) import System.FilePath ((</>)) @@ -15,21 +18,51 @@ import System.Script (syntax, try) data Paragraph = Paragraph { text :: Text , meta :: ParagraphMeta - , accept :: [Text] } deriving Generic + --, label :: Text + } deriving Generic + +newtype DatasetContent = DatasetContent { content :: Text } deriving Generic + +instance ToNamedRecord DatasetContent +instance FromNamedRecord DatasetContent +instance HasDefaultHeader DatasetContent where + defaultFields = DefaultFields ["content"] + +type DatasetRow = ParagraphMeta @ DatasetContent + +{- +data Paragraph = Paragraph + { text :: Text + , meta :: ParagraphMeta + , accept :: [Text] + } deriving Generic +-} instance ToJSON Paragraph where toEncoding = genericToEncoding defaultOptions -loadParagraph :: FilePath -> ClassifiedParagraph -> IO Paragraph -loadParagraph source (meta@(paragraphRecord :@: _) :@: classification) = do +--loadParagraph :: FilePath -> ClassifiedParagraph -> IO Paragraph +fromFile :: FilePath -> ParagraphMeta -> IO Paragraph +--loadParagraph source (meta@(paragraphRecord :@: _) :@: classification) = do +fromFile source meta@(paragraphRecord :@: _) = do text <- Text.readFile (source </> relativePath paragraphRecord "txt") - pure $ Paragraph {text, meta, accept = [paragraphFunction classification]} + --pure $ Paragraph {text, meta, accept = []} + --pure $ Paragraph {text, meta, accept = [paragraphFunction classification], answer = "accept"} + --pure $ Paragraph {text, meta, label = paragraphFunction classification} + pure $ Paragraph {text, meta} + +fromRow :: DatasetRow -> IO Paragraph +fromRow (meta :@: (DatasetContent {content})) = + pure $ Paragraph {text = content, meta} main :: IO () main = getArgs >>= run where - run [inputMeta, source] = - try (readNamedTsv inputMeta) >>= mapM_ (printJSON source) - run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY" - printJSON source parMeta = - loadParagraph source parMeta >>= ByteString.putStrLn . encode + run [dataset] = f fromRow dataset + run [inputMeta, source] = f (fromFile source) inputMeta +-- try (readNamedTsv inputMeta) >>= mapM_ (printJSON source) + run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY | TSV_DATASET" + f loader input = + try (readNamedTsv input) >>= mapM_ (\row -> loader row >>= ByteString.putStrLn . encode) + --printJSON source parMeta = + -- loadParagraph source parMeta >>= ByteString.putStrLn . encode diff --git a/scripts/ML/prodigyAcceptedJSONLToTSV.py b/scripts/ML/prodigyAcceptedJSONLToTSV.py index 2395caa68b01a53e89d72b50c9b6bb1a9f96fca1..d0fd491c38e3da48a4873ca0e7a577d25c743f6f 100755 --- a/scripts/ML/prodigyAcceptedJSONLToTSV.py +++ b/scripts/ML/prodigyAcceptedJSONLToTSV.py @@ -1,11 +1,11 @@ -#!/usr/bin/env python3 +#!/usr/bin/env -S PYTHONPATH=lib/python python3 -from GEODE import toKey +from GEODE import toKey, toTSV import pandas import JSONL import sys -def tsv_row(annotation): +def toRow(annotation): return {'work': annotation['meta']['work'], 'volume': annotation['meta']['volume'], 'article': annotation['meta']['article'], @@ -15,9 +15,9 @@ def tsv_row(annotation): } def acceptedToTSV(inputJSONL, outputTSV): - annotations = pandas.DataFrame( - sorted([tsv_row(a) for a in inputJSONL], key=toKey)) - annotations.to_csv(outputTSV, sep='\t', index=False) + toTSV(outputTSV, [toRow(a) for a in inputJSONL if a['answer'] == 'accept']) + #toTSV(outputTSV, [toRow(a) for a in inputJSONL if a['answer'] == 'accept'], + # sortBy=None) if __name__ == '__main__': acceptedToTSV(JSONL.load(sys.argv[1]), sys.argv[2]) diff --git a/scripts/ML/prodigyMultiJSONLToDirectory.py b/scripts/ML/prodigyMultiJSONLToDirectory.py index cf3ccae84f1cf3471ee3978fd0dc54a59cdad64c..0c1227e031cd8290615db2f4606f2e96f2b933a2 100755 --- a/scripts/ML/prodigyMultiJSONLToDirectory.py +++ b/scripts/ML/prodigyMultiJSONLToDirectory.py @@ -1,16 +1,12 @@ -#!/usr/bin/env python3 +#!/usr/bin/env -S PYTHONPATH=lib/python python3 -from Corpus import Directory -from GEODE import toKey, uid -import GEODE.discursive as discursive -from GEODE.util import initialise -import pandas +from GEODE import Directory, discursiveFunctions, toKey, toTSV, uid +from split import initialise, toIterator +from split.Error import Contradiction, NoLabelLeft, TwoAnnotations, UnknownAnswer +from unbalanceLimiter import unbalanceLimiter import JSONL import sys -def subDict(d, keys): - return {key: d[key] for key in keys} - def initialiseTexts(texts, key, annotation): initialise(texts, key, @@ -26,7 +22,7 @@ def byLabel(annotations): answer = annotation['answer'] initialise(labels, label, {'accept': [], 'reject': [], 'ignore': []}) if answer not in labels[label]: - print(f"Unsupported answer '{answer}' for annotation {annotation}") + UnknownAnswer(annotation, answer) else: labels[label][answer].append(annotation) return labels @@ -34,7 +30,6 @@ def byLabel(annotations): def erase(texts, error, key, reason): error[key] = texts[key]['row'] del texts[key] - print(reason) def accept(texts, errors, label, accepted): for annotation in accepted: @@ -43,8 +38,8 @@ def accept(texts, errors, label, accepted): initialiseTexts(texts, key, annotation) previous = texts[key]['accept'] if previous is not None: - reason = f"Found two annotations for {uid(annotation['meta'])}: '{label}' and '{previous}'" - erase(texts, errors, key, reason) + TwoAnnotations(annotation, previous, label) + erase(texts, errors, key) else: texts[key]['accept'] = label @@ -55,19 +50,21 @@ def reject(texts, errors, label, rejected): initialiseTexts(texts, key, annotation) previous = texts[key]['accept'] if previous is not None and previous == label: - erase(texts, errors, key, f"Contradiction found for {uid(annotation['meta'])}: function {label} should be both accepted and rejected") + Contradiction(annotation, label) + erase(texts, errors, key) else: texts[key]['reject'].add(label) def checkRejects(texts, errors): for key, text in texts.items(): countRejected = len(text['reject']) - countFunctions = len(discursive.functions) + countFunctions = len(discursiveFunctions) if countRejected == countFunctions: - reason = f"No possible function left for {uid(text['row'])}" - erase(texts, errors, key, reason) + NoLabelLeft(text['row']) + erase(texts, errors, key) elif text['accept'] is None and countRejected == countFunctions - 1: - text['accept'] = discursive.functions.difference(text['reject']).pop() + free = set(discursiveFunctions).difference(text['reject']) + text['accept'] = free.pop() print(f"Infered {uid(text['row'])} to be {text['accept']}, only discursive function left unrejected") def byText(byLabelAnnotations): @@ -79,19 +76,9 @@ def byText(byLabelAnnotations): checkRejects(texts, errors) return texts.values(), errors.values() -def toTsv(filePath, data): - rows = sorted(data, key=toKey) - pandas.DataFrame(rows).to_csv(filePath, sep='\t', index=False) - -def toIterator(*args): - for arg in args: - for elem in arg: - yield elem - def exportCorpus(rootDirectory, texts, errors): corpus = Directory(rootDirectory) - corpus.save(sorted(toIterator([t['row'] for t in texts], errors), - key=toKey)) + corpus.save(toIterator([t['row'] for t in texts], errors)) def indexByKey(annotations): return {toKey(annotation['meta']): annotation for annotation in annotations} @@ -110,16 +97,19 @@ def toRow(answer): def exportLabels(rootDirectory, labels): for label, answers in labels.items(): - toTsv(f"{rootDirectory}/{label}.tsv", - toIterator(map(toRow('accept'), answers['accept']), - map(toRow('reject'), allRejects(labels, label)))) + toTSV(f"{rootDirectory}/{label}.tsv", + unbalanceLimiter( + toIterator(map(toRow('accept'), answers['accept']), + map(toRow('reject'), allRejects(labels, label))), + maxRatio=4, + attribute='answer')) def multiJSONLToDirectory(jsonl, outputDirectory): byLabelAnnotations = byLabel(jsonl) texts, errors = byText(byLabelAnnotations) exportCorpus(outputDirectory, texts, errors) if len(errors) > 0: - toTsv(f"{outputDirectory}/errors.tsv", errors) + toTSV(f"{outputDirectory}/errors.tsv", errors) exportLabels(outputDirectory, byLabelAnnotations) if __name__ == '__main__': diff --git a/scripts/ML/simpleTrainOfMulti.py b/scripts/ML/simpleTrainOfMulti.py deleted file mode 100755 index 5f80001e4d1ea416f1f3dd86c82783ff8dc44b93..0000000000000000000000000000000000000000 --- a/scripts/ML/simpleTrainOfMulti.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 - -from Corpus import Directory, SelfContained -from GEODE import fromKey, toKey -import GEODE.discursive as discursive -from prodigyAcceptedJSONLToTSV import acceptedToTSV -from sys import argv - -def isAccepted(key, row): - return row['answer'] == 'accept' - -def withLabel(corpus, label): - return lambda key, row: dict(**corpus.full(key, row) - , paragraphFunction=label) - -def simpleTrainOfMulti(multiDirectory, outputTSV): - annotations = [] - for className in discursive.functions: - corpus = Directory(multiDirectory, tsv_filename=className) - p = withLabel(corpus, className) - annotations += list(corpus.get_all(projector=p, where=isAccepted)) - output = SelfContained(outputTSV) - output.save(sorted(annotations, key=toKey)) - -if __name__ == '__main__': - simpleTrainOfMulti(argv[1], argv[2]) diff --git a/scripts/ML/splitMulti.py b/scripts/ML/splitMulti.py index d5cd2a70d5ea95c1d58e624d054cdfb76df77c3e..9de14234fc9c6f2426b0764cb8ca30030bb535fb 100755 --- a/scripts/ML/splitMulti.py +++ b/scripts/ML/splitMulti.py @@ -1,13 +1,11 @@ -#!/usr/bin/env python3 -from Corpus import Directory +#!/usr/bin/env -S PYTHONPATH=lib/python python3 + from GEODE import toKey -from GEODE.Error import TwoAnnotations -from GEODE.util import initialise, parseRatio +from split import initialise, parseRatio +from split.Error import TwoAnnotations import JSONL from random import shuffle from sys import argv, stdin -from prodigyAcceptedJSONLToTSV import acceptedToTSV -from prodigyMultiJSONLToDirectory import multiJSONLToDirectory def getTexts(inputJSONL): texts = {} @@ -16,19 +14,20 @@ def getTexts(inputJSONL): key = toKey(annotation['meta']) if key not in errors: initialise(texts, key, {'accept': None, 'reject': []}) - if annotation['answer'] == 'accept': - previous = texts[key]['accept'] - if previous is None: - texts[key]['accept'] = annotation - else: - print(TwoAnnotations(annotations['meta'], - previous['label'], - texts[key]['label'])) - errors.add(key) - else: - texts[key]['reject'].append(annotation) + sortByAnswer(texts, errors, key, annotation) return texts +def sortByAnswer(texts, errors, key, annotation): + if annotation['answer'] == 'accept': + previous = texts[key]['accept'] + if previous is None: + texts[key]['accept'] = annotation + else: + TwoAnnotations(annotation, previous['label'], texts[key]['label']) + errors.add(key) + else: + texts[key]['reject'].append(annotation) + def getTest(texts, trainRatio): accepted = [key for key, t in texts.items() if t['accept'] is not None] shuffle(accepted) @@ -42,16 +41,22 @@ def allAnnotations(text): return [text['accept']] + text['reject'] def getTrain(texts, test): - return [annotation - for key in sorted(texts.keys()) if key not in test - for annotation in allAnnotations(texts[key])] + train = [] + waste = [] + for key in sorted(texts.keys()): + if key not in test: + train += allAnnotations(texts[key]) + else: + waste += texts[key]['reject'] + return train, waste def splitMulti(jsonl, trainRatio, trainOutput, testOutput): texts = getTexts(jsonl) test = getTest(texts, trainRatio) - train = getTrain(texts, test) - multiJSONLToDirectory(train, trainOutput) - acceptedToTSV(test.values(), testOutput) + train, waste = getTrain(texts, test) + print(f"{len(waste)} negative annotations about texts in the test set have been discarded from the training one") + JSONL.save(trainOutput, train) + JSONL.save(testOutput, test.values()) if __name__ == '__main__': splitMulti(JSONL.load(stdin), parseRatio(argv[1]), argv[2], argv[3]) diff --git a/scripts/ML/train.py b/scripts/ML/train.py deleted file mode 100755 index 95b812f79dde5dcaee522849126dd8188407a731..0000000000000000000000000000000000000000 --- a/scripts/ML/train.py +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env python3 -from BERT import Trainer -from LabeledData import LabeledData -import sys - -if __name__ == '__main__': - labeled_data = LabeledData(sys.argv[1]) - trainer = Trainer(sys.argv[2], labeled_data) - trainer() diff --git a/scripts/ML/trainMultiBERT.py b/scripts/ML/trainMultiBERT.py index 43cc10b002f5dd35624e3726a3bfa6df277c1834..f8eae396cf53a63aef641967e26951edf6654876 100755 --- a/scripts/ML/trainMultiBERT.py +++ b/scripts/ML/trainMultiBERT.py @@ -1,8 +1,7 @@ -#!/usr/bin/env python3 -from BERT import BERT, Trainer -from Corpus import Directory -import GEODE.discursive as discursive -from LabeledData import LabeledData +#!/usr/bin/env -S PYTHONPATH=lib/python python3 + +from BERT import BERT, LabeledData, Trainer +from GEODE import Directory, discursiveFunctions import os import sys @@ -20,5 +19,5 @@ def trainSubClassifier(trainRoot, modelRoot, className): trainer() if __name__ == '__main__': - for className in discursive.functions: + for className in discursiveFunctions: trainSubClassifier(sys.argv[1], sys.argv[2], className) diff --git a/scripts/ML/trainSimpleBERT.py b/scripts/ML/trainSimpleBERT.py index d869b4a8b0f73743515f5c6e5824a5aa267475c1..28107d3548707bf8952318dda5cd0386e3e7517a 100755 --- a/scripts/ML/trainSimpleBERT.py +++ b/scripts/ML/trainSimpleBERT.py @@ -1,10 +1,13 @@ -#!/usr/bin/env python3 -from Corpus import corpus -from BERT import Trainer -from LabeledData import LabeledData +#!/usr/bin/env -S PYTHONPATH=lib/python python3 + +from BERT import Trainer, LabeledData +from GEODE import corpus +import os import sys if __name__ == '__main__': labeled_data = LabeledData(corpus(sys.argv[1]), "paragraphFunction") - trainer = Trainer(sys.argv[2], labeled_data) + modelPath = sys.argv[2] + os.makedirs(modelPath, exist_ok=True) + trainer = Trainer(modelPath, labeled_data) trainer() diff --git a/scripts/extract-from-source.sh b/scripts/extract-from-source.sh index ba74ce0bcba69b85133be770bd252017ebd09284..f89f8859c07816606f44d45d35c18587517b7153 100755 --- a/scripts/extract-from-source.sh +++ b/scripts/extract-from-source.sh @@ -1,8 +1,8 @@ #!/bin/sh -BASE_DIR="${0%/*}" +BASE_DIR="${0%%/*}" -source ${BASE_DIR}/lib.sh +source ${BASE_DIR}/lib/bash.sh if [ "$#" != 2 ] then @@ -16,5 +16,5 @@ fi FILES_TSV="${TARGET}/files.tsv" printf "book tome rank headWord name page\n" > "${FILES_TSV}" -${BASE_DIR}/EDdA/extract-from-source.sh "${SOURCE}/EDdA/ARTFL" ${TARGET} >> "${FILES_TSV}" -${BASE_DIR}/LGE/extract-from-source.sh "${SOURCE}/LGE/BnF" ${TARGET} >> "${FILES_TSV}" +${BASE_DIR}/scripts/EDdA/extract-from-source.sh "${SOURCE}/EDdA/ARTFL" ${TARGET} >> "${FILES_TSV}" +${BASE_DIR}/scripts/LGE/extract-from-source.sh "${SOURCE}/LGE/BnF" ${TARGET} >> "${FILES_TSV}"