Compare revisions

Changes are shown as if the source revision was being merged into the target revision (project: abrenon/outillage).
Showing 272 additions and 165 deletions
+from GEODE import uid
+
+def TwoAnnotations(text, first, second):
+    textUID = text if type(text) == str else uid(text)
+    return f"Found two annotations for {textUID}: '{first}' and '{second}'"
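These message helpers only format diagnostics; as a minimal sketch of how the new TwoAnnotations behaves on a plain-string identifier (the identifier below is made up):

# Hypothetical identifier; uid() is only called for non-string inputs.
print(TwoAnnotations('volume01-article42-paragraph7', 'History', 'Geography'))
# Found two annotations for volume01-article42-paragraph7: 'History' and 'Geography'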
-#!/usr/bin/env python3
-from Corpus import corpus
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+from GEODE import corpus
import sys

if __name__ == '__main__':
......
-#!/usr/bin/env python3
-from EDdA.classification import heatmap
-from EDdA.store import preparePath
-import GEODE.discursive as discursive
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+from GEODE.Visualisation import heatmap
+from GEODE.Classification import discursiveFunctions, knowledgeDomains
import pandas
from sklearn.metrics import classification_report, confusion_matrix
from sys import argv
@@ -9,11 +9,13 @@ from sys import argv
def evaluate(truth, predictions, outputDirectory):
    matrix = confusion_matrix(truth,
                              predictions,
-                             labels=list(discursive.functions),
+                             labels=knowledgeDomains,
+                             #labels=discursiveFunctions
                              normalize='true')
    heatmap(matrix,
-            preparePath(f"{outputDirectory}/confusion.png"),
-            labels=discursive.functions)
+            f"{outputDirectory}/confusion.png",
+            labels=knowledgeDomains)
+            #labels=discursiveFunctions)
    with open(f"{outputDirectory}/report.json", 'w') as json:
        print(classification_report(truth, predictions, output_dict=True),
              file=json)
@@ -24,4 +26,4 @@ def evaluate(truth, predictions, outputDirectory):
if __name__ == '__main__':
    truth = pandas.read_csv(argv[1], sep='\t')
    predictions = pandas.read_csv(argv[2], sep='\t')
-    evaluate(truth['paragraphFunction'], predictions['label'], argv[3])
+    evaluate(truth['super_domain'], predictions['label'], argv[3])
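The evaluation itself is standard scikit-learn; here is a self-contained sketch of what evaluate() computes, with toy labels standing in for the GEODE knowledge domains (the domain names below are invented):

from sklearn.metrics import classification_report, confusion_matrix

# Hypothetical domain labels and predictions, for illustration only.
labels = ['Geography', 'History', 'Physics']
truth = ['Geography', 'History', 'History', 'Physics']
predictions = ['Geography', 'History', 'Physics', 'Physics']

# Row-normalised confusion matrix, as passed to heatmap() above.
matrix = confusion_matrix(truth, predictions, labels=labels, normalize='true')
# Per-class precision/recall/F1 as a dict, as written to report.json above.
report = classification_report(truth, predictions, output_dict=True)
print(matrix)
print(report['History'])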
+import numpy
+import random
+import torch
+
+def set_random():
+    seed_value = 42
+    random.seed(seed_value)
+    numpy.random.seed(seed_value)
+    torch.manual_seed(seed_value)
+    torch.cuda.manual_seed_all(seed_value)
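Seeding every library is necessary but not sufficient for bit-identical GPU runs; if full reproducibility matters, recent PyTorch versions also expose the following switches (a sketch, not part of the diff above):

import torch

# Ask PyTorch to prefer deterministic kernels (raises on unsupported ops).
torch.use_deterministic_algorithms(True)
# Disable cuDNN autotuning, which may pick different kernels across runs.
torch.backends.cudnn.benchmark = False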
-#!/usr/bin/env python3
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
from BERT import Classifier
-from Corpus import corpus
-import GEODE.discursive as discursive
+from GEODE import corpus, discursiveFunctions, toTSV
import pandas
from sys import argv
@@ -10,12 +10,12 @@ def rateClass(name, answer, score):
def combine(row):
    classes = [(name, row[name], row[name + 'Score'])
-               for name in discursive.functions]
+               for name in discursiveFunctions]
    return max(classes, key=lambda c: rateClass(*c))[0]

def label(modelsRoot, source):
    records = pandas.DataFrame(source.get_all('key'))
-    for name in discursive.functions:
+    for name in discursiveFunctions:
        classify = Classifier(f"{modelsRoot}/{name}")
        content = source.get_all('content')
        records[name], records[name + 'Score'] = classify(content)
@@ -23,4 +23,4 @@ def label(modelsRoot, source):
    return records

if __name__ == '__main__':
-    label(argv[1], corpus(argv[2])).to_csv(argv[3], sep='\t', index=False)
+    toTSV(argv[3], label(argv[1], corpus(argv[2])))
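rateClass is defined in a part of the file this diff does not show; a plausible stand-in makes the selection in combine() concrete (both the stand-in and the row values below are hypothetical):

def rateClass(name, answer, score):
    # Stand-in: favour accepted classes, penalise rejected ones.
    return score if answer == 'accept' else -score

row = {'A': 'accept', 'AScore': 0.7, 'B': 'reject', 'BScore': 0.9}
classes = [(name, row[name], row[name + 'Score']) for name in ['A', 'B']]
# combine() keeps the best-rated class name: here 'A' (0.7 beats -0.9).
print(max(classes, key=lambda c: rateClass(*c))[0])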
-#!/usr/bin/env python3
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
from BERT import Classifier
+from GEODE import corpus, toTSV
import pandas
-from Corpus import corpus
from sys import argv

def label(classify, source, name='label'):
@@ -26,6 +27,4 @@ def label(classify, source, name='label'):
    return records

if __name__ == '__main__':
-    classify = Classifier(argv[1])
-    source = corpus(argv[2])
-    label(classify, source).to_csv(argv[3], sep='\t', index=False)
+    toTSV(argv[3], label(Classifier(argv[1]), corpus(argv[2])))
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances"
{-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-}

-import Data.Aeson ((.:), FromJSON(..), Value(..), encode, withArray, withText, eitherDecode)
-import Data.Aeson.Types (prependFailure, typeMismatch)
+import Control.Applicative ((<|>))
+import Control.Monad.Except (MonadError(..), ExceptT(..), runExceptT)
+import Data.Aeson ((.:), FromJSON(..), Object, Value(..), encode, withArray, withObject, withText, eitherDecode)
+import Data.Aeson.Types (Parser, prependFailure, typeMismatch)
import Data.ByteString.Lazy as BS (null, readFile, split)
import Data.ByteString.Lazy.Char8 as BS (unpack)
+import Data.String (IsString(..))
+import Data.Text as Text (Text)
import Data.Vector as Vector (head)
-import GEODE.Metadata (type (@)(..), tsvFile)
+import GEODE.Metadata (type (@)(..), Record(..), tsvFile)
import GEODE.Metadata.ProdigyMeta (Classification(..), ClassifiedParagraph)
import System.Environment (getArgs)
import System.Script (try, syntax, warn)
+import Text.Printf (printf)
data Row = Unclassified String | Full ClassifiedParagraph

instance {-# OVERLAPS #-} FromJSON Row where
+  {-
+  parseJSON o@(Object v) = do
+      paragraphMeta@(paragraphRecord :@: _) <- v .: "meta"
+      --classified <- v .: "accept" >>= parseClassification
+      classified <- v .: "label"
+      pure $ either (\_ -> Unclassified $ debug paragraphRecord) (Full . (paragraphMeta :@:)) classified
+    where
+      parseClassification = withArray "Classification" singleValue
+      singleValue a
+        | not $ Prelude.null a =
+            withText "domain" (pure . Right . Classification) (Vector.head a)
+      singleValue _ = pure $ Left
+        ("Looks like " ++ BS.unpack (encode o) ++ " was not classified, ignoring for now")
+      debug record =
+        "Looks like " ++ uid record ++ " was not classified, ignoring for now"
-  parseJSON o@(Object v) = do
-      paragraphMeta <- v .: "meta" >>= parseJSON
-      classified <- v .: "accept" >>= parseClassification
@@ -24,10 +45,47 @@ instance {-# OVERLAPS #-} FromJSON Row where
-      singleValue _ = pure $ Left
-        ("Looks like " ++ debug ++ " was not classified, ignoring for now")
-      debug = BS.unpack $ encode o
+  -}
+  parseJSON = withObject "Row" parseRow
+    where
+      parseRow o = do
+        paragraphMeta <- o .: "meta"
+        getRow paragraphMeta
+          <$> runExceptT (classification o)
+      getRow paragraphMeta@(paragraphRecord :@: _) = either
+        (Unclassified . debug paragraphRecord)
+        (Full . (paragraphMeta :@:) . Classification)
+      classification :: Object -> ExceptT String Parser Text
+      classification o = do
+        getTextField "answer" o >>= isAccept
+        getTextField "label" o
+        --o .: "label" >>= withText "label" pure
+      --checkAnswer o = ExceptT
+      --  ((o .: "answer" >>= withText "answer" (pure . isAccept))
+      --    <|> pure (Left "answer field is missing"))
+      isAccept "accept" = pure ()
+      isAccept s = throwError $ printf "answer was \"%s\" and not \"accept\"" s
+      --isAccept s = Left $ printf "answer was \"%s\" and not \"accept\"" s
+      debug record = printf "Ignoring %s: %s" (uid record)
+      getTextField :: String -> Object -> ExceptT String Parser Text
+      getTextField name o = getField >>= ensureIsText
+        where
+          getField :: ExceptT String Parser Value
+          getField = ExceptT $
+            (Right <$> (o .: fromString name)) <|> catch "is missing"
+          ensureIsText :: Value -> ExceptT String Parser Text
+          ensureIsText v = ExceptT $
+            withText name (pure . Right) v <|> catch "is not text"
+          catch :: String -> Parser (Either String a)
+          catch = pure . Left . printf "%s field %s" name
+  {-
  parseJSON invalid =
-    prependFailure "parsing ClassifiedParagraph failed, "
+    prependFailure "parsing Row failed, "
      (typeMismatch "Object" invalid)
+  -}
logIgnored :: [Row] -> IO [ClassifiedParagraph]
logIgnored = foldr keepFull (pure [])
......
-#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-}

import Data.Aeson (ToJSON(..), defaultOptions, encode, genericToEncoding)
import Data.ByteString.Lazy.Char8 as ByteString (putStrLn)
+import Data.Csv (FromNamedRecord(..), ToNamedRecord(..))
import Data.Text (Text)
import Data.Text.IO as Text (readFile)
-import GEODE.Metadata (type (@)(..), Record(..), readNamedTsv)
-import GEODE.Metadata.ProdigyMeta
-  (Classification(..), ClassifiedParagraph, ParagraphMeta)
+import GEODE.Metadata
+  ( type (@)(..), DefaultFields(..), HasDefaultHeader(..), Record(..)
+  , readNamedTsv )
+import GEODE.Metadata.ProdigyMeta (ParagraphMeta)
+-- (Classification(..), ClassifiedParagraph, ParagraphMeta)
import GHC.Generics (Generic)
import System.Environment (getArgs)
import System.FilePath ((</>))
@@ -15,21 +18,51 @@ import System.Script (syntax, try)
data Paragraph = Paragraph
  { text :: Text
  , meta :: ParagraphMeta
-  , accept :: [Text] } deriving Generic
+  --, label :: Text
+  } deriving Generic
+
+newtype DatasetContent = DatasetContent { content :: Text } deriving Generic
+
+instance ToNamedRecord DatasetContent
+instance FromNamedRecord DatasetContent
+instance HasDefaultHeader DatasetContent where
+  defaultFields = DefaultFields ["content"]
+
+type DatasetRow = ParagraphMeta @ DatasetContent
+
+{-
+data Paragraph = Paragraph
+  { text :: Text
+  , meta :: ParagraphMeta
+  , accept :: [Text]
+  } deriving Generic
+-}

instance ToJSON Paragraph where
  toEncoding = genericToEncoding defaultOptions

-loadParagraph :: FilePath -> ClassifiedParagraph -> IO Paragraph
-loadParagraph source (meta@(paragraphRecord :@: _) :@: classification) = do
+--loadParagraph :: FilePath -> ClassifiedParagraph -> IO Paragraph
+fromFile :: FilePath -> ParagraphMeta -> IO Paragraph
+--loadParagraph source (meta@(paragraphRecord :@: _) :@: classification) = do
+fromFile source meta@(paragraphRecord :@: _) = do
  text <- Text.readFile (source </> relativePath paragraphRecord "txt")
-  pure $ Paragraph {text, meta, accept = [paragraphFunction classification]}
+  --pure $ Paragraph {text, meta, accept = []}
+  --pure $ Paragraph {text, meta, accept = [paragraphFunction classification], answer = "accept"}
+  --pure $ Paragraph {text, meta, label = paragraphFunction classification}
+  pure $ Paragraph {text, meta}
+
+fromRow :: DatasetRow -> IO Paragraph
+fromRow (meta :@: (DatasetContent {content})) =
+  pure $ Paragraph {text = content, meta}
main :: IO ()
main = getArgs >>= run
  where
-    run [inputMeta, source] =
-      try (readNamedTsv inputMeta) >>= mapM_ (printJSON source)
-    run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY"
-    printJSON source parMeta =
-      loadParagraph source parMeta >>= ByteString.putStrLn . encode
+    run [dataset] = f fromRow dataset
+    run [inputMeta, source] = f (fromFile source) inputMeta
+    --  try (readNamedTsv inputMeta) >>= mapM_ (printJSON source)
+    run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY | TSV_DATASET"
+    f loader input =
+      try (readNamedTsv input) >>= mapM_ (\row -> loader row >>= ByteString.putStrLn . encode)
+    --printJSON source parMeta =
+    --  loadParagraph source parMeta >>= ByteString.putStrLn . encode
-#!/usr/bin/env python3
-from GEODE import toKey
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+from GEODE import toKey, toTSV
import pandas
import JSONL
import sys

-def tsv_row(annotation):
+def toRow(annotation):
    return {'work': annotation['meta']['work'],
            'volume': annotation['meta']['volume'],
            'article': annotation['meta']['article'],
@@ -15,9 +15,9 @@ def tsv_row(annotation):
            }

def acceptedToTSV(inputJSONL, outputTSV):
-    annotations = pandas.DataFrame(
-        sorted([tsv_row(a) for a in inputJSONL], key=toKey))
-    annotations.to_csv(outputTSV, sep='\t', index=False)
+    toTSV(outputTSV, [toRow(a) for a in inputJSONL if a['answer'] == 'accept'])
+    #toTSV(outputTSV, [toRow(a) for a in inputJSONL if a['answer'] == 'accept'],
+    #      sortBy=None)

if __name__ == '__main__':
    acceptedToTSV(JSONL.load(sys.argv[1]), sys.argv[2])
-#!/usr/bin/env python3
-from Corpus import Directory
-from GEODE import toKey, uid
-import GEODE.discursive as discursive
-from GEODE.util import initialise
-import pandas
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+from GEODE import Directory, discursiveFunctions, toKey, toTSV, uid
+from split import initialise, toIterator
+from split.Error import Contradiction, NoLabelLeft, TwoAnnotations, UnknownAnswer
+from unbalanceLimiter import unbalanceLimiter
import JSONL
import sys

-def subDict(d, keys):
-    return {key: d[key] for key in keys}
-
def initialiseTexts(texts, key, annotation):
    initialise(texts,
               key,
@@ -26,7 +22,7 @@ def byLabel(annotations):
        answer = annotation['answer']
        initialise(labels, label, {'accept': [], 'reject': [], 'ignore': []})
        if answer not in labels[label]:
-            print(f"Unsupported answer '{answer}' for annotation {annotation}")
+            print(UnknownAnswer(annotation, answer))
        else:
            labels[label][answer].append(annotation)
    return labels
@@ -34,7 +30,6 @@ def byLabel(annotations):
-def erase(texts, error, key, reason):
+def erase(texts, error, key):
    error[key] = texts[key]['row']
    del texts[key]
-    print(reason)

def accept(texts, errors, label, accepted):
    for annotation in accepted:
@@ -43,8 +38,8 @@ def accept(texts, errors, label, accepted):
        initialiseTexts(texts, key, annotation)
        previous = texts[key]['accept']
        if previous is not None:
-            reason = f"Found two annotations for {uid(annotation['meta'])}: '{label}' and '{previous}'"
-            erase(texts, errors, key, reason)
+            print(TwoAnnotations(annotation, previous, label))
+            erase(texts, errors, key)
        else:
            texts[key]['accept'] = label
@@ -55,19 +50,21 @@ def reject(texts, errors, label, rejected):
        initialiseTexts(texts, key, annotation)
        previous = texts[key]['accept']
        if previous is not None and previous == label:
-            erase(texts, errors, key, f"Contradiction found for {uid(annotation['meta'])}: function {label} should be both accepted and rejected")
+            print(Contradiction(annotation, label))
+            erase(texts, errors, key)
        else:
            texts[key]['reject'].add(label)

def checkRejects(texts, errors):
    for key, text in texts.items():
        countRejected = len(text['reject'])
-        countFunctions = len(discursive.functions)
+        countFunctions = len(discursiveFunctions)
        if countRejected == countFunctions:
-            reason = f"No possible function left for {uid(text['row'])}"
-            erase(texts, errors, key, reason)
+            print(NoLabelLeft(text['row']))
+            erase(texts, errors, key)
        elif text['accept'] is None and countRejected == countFunctions - 1:
-            text['accept'] = discursive.functions.difference(text['reject']).pop()
+            free = set(discursiveFunctions).difference(text['reject'])
+            text['accept'] = free.pop()
            print(f"Inferred {uid(text['row'])} to be {text['accept']}, only discursive function left unrejected")
def byText(byLabelAnnotations):
@@ -79,19 +76,9 @@ def byText(byLabelAnnotations):
    checkRejects(texts, errors)
    return texts.values(), errors.values()

-def toTsv(filePath, data):
-    rows = sorted(data, key=toKey)
-    pandas.DataFrame(rows).to_csv(filePath, sep='\t', index=False)
-
-def toIterator(*args):
-    for arg in args:
-        for elem in arg:
-            yield elem
-
def exportCorpus(rootDirectory, texts, errors):
    corpus = Directory(rootDirectory)
-    corpus.save(sorted(toIterator([t['row'] for t in texts], errors),
-                       key=toKey))
+    corpus.save(toIterator([t['row'] for t in texts], errors))

def indexByKey(annotations):
    return {toKey(annotation['meta']): annotation for annotation in annotations}
@@ -110,16 +97,19 @@ def toRow(answer):
def exportLabels(rootDirectory, labels):
    for label, answers in labels.items():
-        toTsv(f"{rootDirectory}/{label}.tsv",
-              toIterator(map(toRow('accept'), answers['accept']),
-                         map(toRow('reject'), allRejects(labels, label))))
+        toTSV(f"{rootDirectory}/{label}.tsv",
+              unbalanceLimiter(
+                  toIterator(map(toRow('accept'), answers['accept']),
+                             map(toRow('reject'), allRejects(labels, label))),
+                  maxRatio=4,
+                  attribute='answer'))
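unbalanceLimiter comes from a project module whose source is not part of this diff; judging from the call above, a minimal sketch of the intended behaviour could look like this (the implementation below is an assumption, not the project's code):

def unbalanceLimiterSketch(rows, maxRatio, attribute):
    # Group rows by the value of `attribute` (here: 'accept' vs 'reject').
    byClass = {}
    for row in rows:
        byClass.setdefault(row[attribute], []).append(row)
    # Cap every class at maxRatio times the size of the smallest class,
    # so a 4:1 limit keeps the reject/accept imbalance bounded.
    cap = maxRatio * min(len(group) for group in byClass.values())
    return [row for group in byClass.values() for row in group[:cap]]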
def multiJSONLToDirectory(jsonl, outputDirectory):
    byLabelAnnotations = byLabel(jsonl)
    texts, errors = byText(byLabelAnnotations)
    exportCorpus(outputDirectory, texts, errors)
    if len(errors) > 0:
-        toTsv(f"{outputDirectory}/errors.tsv", errors)
+        toTSV(f"{outputDirectory}/errors.tsv", errors)
    exportLabels(outputDirectory, byLabelAnnotations)
if __name__ == '__main__':
......
-#!/usr/bin/env python3
-from Corpus import Directory, SelfContained
-from GEODE import fromKey, toKey
-import GEODE.discursive as discursive
-from prodigyAcceptedJSONLToTSV import acceptedToTSV
-from sys import argv
-
-def isAccepted(key, row):
-    return row['answer'] == 'accept'
-
-def withLabel(corpus, label):
-    return lambda key, row: dict(**corpus.full(key, row)
-                                , paragraphFunction=label)
-
-def simpleTrainOfMulti(multiDirectory, outputTSV):
-    annotations = []
-    for className in discursive.functions:
-        corpus = Directory(multiDirectory, tsv_filename=className)
-        p = withLabel(corpus, className)
-        annotations += list(corpus.get_all(projector=p, where=isAccepted))
-    output = SelfContained(outputTSV)
-    output.save(sorted(annotations, key=toKey))
-
-if __name__ == '__main__':
-    simpleTrainOfMulti(argv[1], argv[2])
-#!/usr/bin/env python3
-from Corpus import Directory
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
from GEODE import toKey
-from GEODE.Error import TwoAnnotations
-from GEODE.util import initialise, parseRatio
+from split import initialise, parseRatio
+from split.Error import TwoAnnotations
import JSONL
from random import shuffle
from sys import argv, stdin
-from prodigyAcceptedJSONLToTSV import acceptedToTSV
-from prodigyMultiJSONLToDirectory import multiJSONLToDirectory

def getTexts(inputJSONL):
    texts = {}
@@ -16,19 +14,20 @@ def getTexts(inputJSONL):
        key = toKey(annotation['meta'])
        if key not in errors:
            initialise(texts, key, {'accept': None, 'reject': []})
-            if annotation['answer'] == 'accept':
-                previous = texts[key]['accept']
-                if previous is None:
-                    texts[key]['accept'] = annotation
-                else:
-                    print(TwoAnnotations(annotations['meta'],
-                                         previous['label'],
-                                         texts[key]['label']))
-                    errors.add(key)
-            else:
-                texts[key]['reject'].append(annotation)
+            sortByAnswer(texts, errors, key, annotation)
    return texts

+def sortByAnswer(texts, errors, key, annotation):
+    if annotation['answer'] == 'accept':
+        previous = texts[key]['accept']
+        if previous is None:
+            texts[key]['accept'] = annotation
+        else:
+            print(TwoAnnotations(annotation, previous['label'], annotation['label']))
+            errors.add(key)
+    else:
+        texts[key]['reject'].append(annotation)

def getTest(texts, trainRatio):
    accepted = [key for key, t in texts.items() if t['accept'] is not None]
    shuffle(accepted)
@@ -42,16 +41,22 @@ def allAnnotations(text):
    return [text['accept']] + text['reject']

def getTrain(texts, test):
-    return [annotation
-            for key in sorted(texts.keys()) if key not in test
-            for annotation in allAnnotations(texts[key])]
+    train = []
+    waste = []
+    for key in sorted(texts.keys()):
+        if key not in test:
+            train += allAnnotations(texts[key])
+        else:
+            waste += texts[key]['reject']
+    return train, waste

def splitMulti(jsonl, trainRatio, trainOutput, testOutput):
    texts = getTexts(jsonl)
    test = getTest(texts, trainRatio)
-    train = getTrain(texts, test)
-    multiJSONLToDirectory(train, trainOutput)
-    acceptedToTSV(test.values(), testOutput)
+    train, waste = getTrain(texts, test)
+    print(f"{len(waste)} negative annotations on texts in the test set have been discarded from the training set")
+    JSONL.save(trainOutput, train)
+    JSONL.save(testOutput, test.values())

if __name__ == '__main__':
    splitMulti(JSONL.load(stdin), parseRatio(argv[1]), argv[2], argv[3])
-#!/usr/bin/env python3
-from BERT import Trainer
-from LabeledData import LabeledData
-import sys
-
-if __name__ == '__main__':
-    labeled_data = LabeledData(sys.argv[1])
-    trainer = Trainer(sys.argv[2], labeled_data)
-    trainer()
-#!/usr/bin/env python3
-from BERT import BERT, Trainer
-from Corpus import Directory
-import GEODE.discursive as discursive
-from LabeledData import LabeledData
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+from BERT import BERT, LabeledData, Trainer
+from GEODE import Directory, discursiveFunctions
import os
import sys
@@ -20,5 +19,5 @@ def trainSubClassifier(trainRoot, modelRoot, className):
    trainer()

if __name__ == '__main__':
-    for className in discursive.functions:
+    for className in discursiveFunctions:
        trainSubClassifier(sys.argv[1], sys.argv[2], className)
-#!/usr/bin/env python3
-from Corpus import corpus
-from BERT import Trainer
-from LabeledData import LabeledData
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+from BERT import Trainer, LabeledData
+from GEODE import corpus
+import os
import sys

if __name__ == '__main__':
    labeled_data = LabeledData(corpus(sys.argv[1]), "paragraphFunction")
-    trainer = Trainer(sys.argv[2], labeled_data)
+    modelPath = sys.argv[2]
+    os.makedirs(modelPath, exist_ok=True)
+    trainer = Trainer(modelPath, labeled_data)
    trainer()
#!/bin/sh

-BASE_DIR="${0%/*}"
-source ${BASE_DIR}/lib.sh
+BASE_DIR="${0%%/*}"
+source ${BASE_DIR}/lib/bash.sh
if [ "$#" != 2 ]
then
@@ -16,5 +16,5 @@ fi
FILES_TSV="${TARGET}/files.tsv"

printf "book tome rank headWord name page\n" > "${FILES_TSV}"
-${BASE_DIR}/EDdA/extract-from-source.sh "${SOURCE}/EDdA/ARTFL" ${TARGET} >> "${FILES_TSV}"
-${BASE_DIR}/LGE/extract-from-source.sh "${SOURCE}/LGE/BnF" ${TARGET} >> "${FILES_TSV}"
+${BASE_DIR}/scripts/EDdA/extract-from-source.sh "${SOURCE}/EDdA/ARTFL" ${TARGET} >> "${FILES_TSV}"
+${BASE_DIR}/scripts/LGE/extract-from-source.sh "${SOURCE}/LGE/BnF" ${TARGET} >> "${FILES_TSV}"
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
+
+import Control.Applicative ((<**>), (<|>))
+import Data.List (foldl')
+import Data.Text (Text)
+import Data.Text.IO as Text (getContents, writeFile)
+import Options.Applicative
+  ( Parser, execParser, flag', fullDesc, help, helper, info, long, metavar
+  , progDesc, short, strArgument, strOption)
+import System.Directory (createDirectoryIfMissing)
+import System.FilePath (takeDirectory)
+import Text.Filter (Editable(..))
+import Text.Printf (printf)
+import Text.Regex.TDFA ((=~))
+
+data Mode = Discard | StartWith | EndWith
+
+data Config = Config
+  { splitPattern :: String
+  , mode :: Mode
+  , outputPattern :: String }
+
+configParser :: Parser Config
+configParser = Config
+  <$> strOption on
+  <*> (flag' StartWith startWith <|> flag' EndWith endWith <|> pure Discard)
+  <*> strArgument outputPattern
+  where
+    on = short 'o' <> long "on" <> metavar "REGEX"
+      <> help "pattern of the lines on which to split"
+    outputPattern =
+      metavar "OUTPUT_PATTERN" <> help "pattern of the output files"
+    startWith =
+      short 's' <> long "start" <> help "a part begins with the pattern"
+    endWith = short 'e' <> long "end" <> help "a part ends with the pattern"
+
+getConfig :: IO Config
+getConfig = execParser
+  (info
+    (configParser <**> helper)
+    (fullDesc
+      <> progDesc "A tool to split a textual flow on a predefined line or prefix"))
+
+split :: Config -> [Text] -> [[Text]]
+split (Config {splitPattern, mode}) = reverse . close . foldl' aggregate ([], [])
+  where
+    close (currentPart, previousParts) = reverse currentPart:previousParts
+    aggregate tmp@(currentPart, previousParts) line
+      | line =~ splitPattern =
+          case mode of
+            Discard -> ([], close tmp)
+            StartWith -> ([line], close tmp)
+            EndWith -> ([], close (line:currentPart, previousParts))
+      | otherwise = (line:currentPart, previousParts)
+
+create :: Editable a => Config -> [a] -> IO ()
+create (Config {outputPattern}) = mapM_ createFile . zip [1..] . fmap leave
+  where
+    createFile :: (Int, Text) -> IO ()
+    createFile (i, content) =
+      let path = printf outputPattern i in do
+        createDirectoryIfMissing True (takeDirectory path)
+        Text.writeFile path content
+
+main :: IO ()
+main = do
+  config <- getConfig
+  Text.getContents >>= create config . split config . enter
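For readers more at home in Python, here is a compact sketch of the same three splitting modes (Discard, StartWith, EndWith); it illustrates the algorithm above and is not part of the diff:

import re

def split_flow(lines, pattern, mode='discard'):
    parts, current = [], []
    for line in lines:
        if re.search(pattern, line):
            if mode == 'start':      # a part begins with the matching line
                parts.append(current)
                current = [line]
            elif mode == 'end':      # a part ends with the matching line
                parts.append(current + [line])
                current = []
            else:                    # 'discard': drop the separator line
                parts.append(current)
                current = []
        else:
            current.append(line)
    return parts + [current]

print(split_flow(['a', '==', 'b', '==', 'c'], r'^==$'))
# [['a'], ['b'], ['c']]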