From f8b70bc46a7171ea50377a55e73c7f8b23f4fe70 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Thu, 4 May 2023 16:04:03 +0200
Subject: [PATCH] Assemble the parallel EDdA/LGE corpus: TSV metadata, annotation and TEI/TXM/Lexicoscope export tooling

---
 Makefile                                      |  74 ++++++--
 lib/Conllu/Parse/Paragraph.hs                 |  28 +++
 lib/Data/BIO.hs                               |  45 +++++
 lib/Data/Csv/DB.hs                            |   9 +
 lib/Data/Metadata.hs                          |  79 +++++----
 lib/Data/Metadata/Article.hs                  |  71 ++++----
 lib/Data/Metadata/Paragraph.hs                |  40 +++++
 lib/Data/Metadata/Projector.hs                |  33 ++++
 lib/Data/Metadata/Trie.hs                     |  54 ++++++
 lib/Data/Metadata/Types.hs                    |  38 +++++
 lib/Data/SpaCy.hs                             |  16 ++
 lib/Text/Editor.bak.hs                        | 101 +++++++++++
 lib/Text/Editor.hs                            |  49 +++---
 lib/Text/Filter.hs                            |  50 ++++++
 lib/Text/TEI.hs                               |  99 +++++++++++
 manifest.scm                                  |  10 +-
 scripts/EDdA-metadata.py                      |  34 +++-
 scripts/LGE-metadata-from-EDdA.py             |  12 +-
 scripts/add-domain-metadata.hs                |  50 ++++++
 ...iclesToVolume.sh => articles-to-volume.sh} |   0
 scripts/conllu-to-lexicoscope.hs              |  95 +++++++++++
 scripts/conllu-to-txm.hs                      |  97 +++++++++++
 scripts/detokenizer.hs                        |  22 +++
 scripts/extract-corpus.sh                     |  27 +++
 scripts/extract-parallel-EDdA.sh              |   3 +-
 scripts/extract-parallel-LGE.sh               |   4 +-
 scripts/fix-tsv.hs                            |  26 +++
 scripts/inject-metadata-to-lexicoscope.hs     |  48 ++++++
 scripts/inject-metadata.sh                    |  58 +++++++
 scripts/lge-articles-to-volumes.hs            | 153 +++++++++++++++++
 scripts/linearize.hs                          |   7 +-
 scripts/parallel-links.py                     | 160 ++++++++++++++++++
 scripts/parallel-metadata.hs                  |  15 ++
 scripts/parallel-metadata.py                  |  46 +++++
 scripts/reroot.hs                             |  11 +-
 scripts/spacy-to-txm.hs                       |  65 +++++++
 scripts/stanza-annotator.py                   |  55 ------
 scripts/stanza-txt-annotator.py               |  79 +++++++++
 scripts/stanza-xml-annotator.py               |  69 ++++++++
 scripts/xml-annotator.py.bak                  |  70 ++++++++
 scripts/xml-annotator.py.remote               |  67 ++++++++
 41 files changed, 1895 insertions(+), 174 deletions(-)
 create mode 100644 lib/Conllu/Parse/Paragraph.hs
 create mode 100644 lib/Data/BIO.hs
 create mode 100644 lib/Data/Csv/DB.hs
 create mode 100644 lib/Data/Metadata/Paragraph.hs
 create mode 100644 lib/Data/Metadata/Projector.hs
 create mode 100644 lib/Data/Metadata/Trie.hs
 create mode 100644 lib/Data/Metadata/Types.hs
 create mode 100644 lib/Data/SpaCy.hs
 create mode 100644 lib/Text/Editor.bak.hs
 create mode 100644 lib/Text/Filter.hs
 create mode 100644 lib/Text/TEI.hs
 create mode 100755 scripts/add-domain-metadata.hs
 rename scripts/{articlesToVolume.sh => articles-to-volume.sh} (100%)
 create mode 100755 scripts/conllu-to-lexicoscope.hs
 create mode 100755 scripts/conllu-to-txm.hs
 create mode 100755 scripts/detokenizer.hs
 create mode 100755 scripts/extract-corpus.sh
 create mode 100755 scripts/fix-tsv.hs
 create mode 100755 scripts/inject-metadata-to-lexicoscope.hs
 create mode 100755 scripts/inject-metadata.sh
 create mode 100755 scripts/lge-articles-to-volumes.hs
 create mode 100755 scripts/parallel-links.py
 create mode 100644 scripts/parallel-metadata.hs
 create mode 100755 scripts/parallel-metadata.py
 create mode 100755 scripts/spacy-to-txm.hs
 delete mode 100755 scripts/stanza-annotator.py
 create mode 100755 scripts/stanza-txt-annotator.py
 create mode 100755 scripts/stanza-xml-annotator.py
 create mode 100755 scripts/xml-annotator.py.bak
 create mode 100755 scripts/xml-annotator.py.remote

diff --git a/Makefile b/Makefile
index c181e28..2614959 100644
--- a/Makefile
+++ b/Makefile
@@ -1,44 +1,96 @@
 CORPUS_ROOT=/home/alice/Dehors/Corpus
 EDDA=$(CORPUS_ROOT)/EDdA
 LGE=$(CORPUS_ROOT)/LGE
+PARALLEL=$(CORPUS_ROOT)/Parallel
+
+PARALLEL_LINKS=$(PARALLEL)/edda-lge.csv
 
 EDDA_META=$(EDDA)/metadata.csv
+EDDA_API=$(EDDA)/API
+EDDA_CORRESPONDANCES=$(EDDA)/ARTFL-ENCCRE.csv
+PARALLEL_EDDA=$(EDDA)/Parallel
+PARALLEL_EDDA_META=$(PARALLEL_EDDA)/metadata.csv
+RAW_PARALLEL_EDDA=$(PARALLEL)/Text/EDdA
+#RAW_PARALLEL_EDDA=$(PARALLEL_EDDA)/Text
+TEI_PARALLEL_EDDA=$(PARALLEL_EDDA)/TEI
+LINEARIZED_PARALLEL_EDDA_ROOT=$(PARALLEL_EDDA)/Linearized
 
+RAW_EDDA=$(EDDA)/Text
+TEI_EDDA=$(EDDA)/TEI
 RAW_LGE=$(LGE)/Text
-TOME_DIRS=$(wildcard $(RAW_LGE)/T*)
-TOMES=$(TOME_DIRS:$(RAW_LGE)/T%=%)
+EDDA_TOME_DIRS=$(wildcard $(RAW_EDDA)/T*)
+EDDA_TOMES=$(EDDA_TOME_DIRS:$(RAW_EDDA)/T%=%)
+LGE_TOME_DIRS=$(wildcard $(RAW_LGE)/T*)
+LGE_TOMES=$(LGE_TOME_DIRS:$(RAW_LGE)/T%=%)
 
 TEI_LGE=$(LGE)/TEI
 PARALLEL_LGE=$(LGE)/Parallel
 LGE_META_FROM_EDDA=$(PARALLEL_LGE)/metadata.csv
-RAW_PARALLEL_LGE=$(PARALLEL_LGE)/Text
+RAW_PARALLEL_LGE=$(PARALLEL)/Text/LGE
+#RAW_PARALLEL_LGE=$(PARALLEL_LGE)/Text
 LINEARIZED_PARALLEL_LGE_ROOT=$(PARALLEL_LGE)/Linearized
 TEI_PARALLEL_LGE=$(PARALLEL_LGE)/TEI
 
-LINEARIZED_PARALLEL_LGE=$(LINEARIZED_PARALLEL_LGE_ROOT)/ $(TOMES:%=$(LINEARIZED_PARALLEL_LGE_ROOT)/T%)
+LINEARIZED_PARALLEL_EDDA=$(LINEARIZED_PARALLEL_EDDA_ROOT)/ $(EDDA_TOMES:%=$(LINEARIZED_PARALLEL_EDDA_ROOT)/T%)
+LINEARIZED_PARALLEL_LGE=$(LINEARIZED_PARALLEL_LGE_ROOT)/ $(LGE_TOMES:%=$(LINEARIZED_PARALLEL_LGE_ROOT)/T%)
+LINEARIZED_PARALLEL=$(LINEARIZED_PARALLEL_EDDA) $(LINEARIZED_PARALLEL_LGE)
+
+STANZA_PARALLEL_EDDA=$(PARALLEL_EDDA)/stanza
 STANZA_PARALLEL_LGE=$(PARALLEL_LGE)/stanza
+STANZA_PARALLEL=$(STANZA_PARALLEL_EDDA) $(STANZA_PARALLEL_LGE)
+
+TXM_PARALLEL_EDDA=$(PARALLEL_EDDA)/TXM
+TXM_PARALLEL_LGE=$(PARALLEL_LGE)/TXM
+TXM_PARALLEL=$(TXM_PARALLEL_EDDA) $(TXM_PARALLEL_LGE)
+
+METADATA=$(EDDA_META) $(PARALLEL_EDDA_META) $(LGE_META_FROM_EDDA)
 
-METADATA=$(EDDA_META) $(LGE_META_FROM_EDDA)
+PARALLEL_DATA=$(PARALLEL_LINKS)
 
-all: $(METADATA) $(LINEARIZED_PARALLEL_LGE)
+#all: $(PARALLEL_DATA) $(METADATA) $(RAW_PARALLEL_EDDA) $(LINEARIZED_PARALLEL_EDDA) $(STANZA_PARALLEL) $(TXM_PARALLEL)
+all: $(PARALLEL_DATA) $(EDDA_META) $(PARALLEL_EDDA_META) $(TEI_PARALLEL_EDDA)
 
 $(EDDA_META): $(EDDA)/TEI/
-	./scripts/EDdA-metadata.py $< > $@
+	EDDA_API="$(EDDA_API)" ./scripts/EDdA-metadata.py $< $(EDDA_CORRESPONDANCES) > $@
+
+$(PARALLEL_LINKS): $(EDDA_META) $(RAW_LGE)
+	mkdir -p $(PARALLEL)
+	./scripts/parallel-links.py $^ $@
+
+$(PARALLEL_EDDA_META): $(EDDA_META)
+	mkdir -p $(PARALLEL_EDDA)
+	./scripts/LGE-metadata-from-EDdA.py $^ $(RAW_LGE) $@
 
 $(LGE_META_FROM_EDDA): $(EDDA_META) $(PARALLEL_LGE)/
 	./scripts/LGE-metadata-from-EDdA.py $< $(RAW_LGE) $@
 
+$(RAW_PARALLEL_EDDA): $(PARALLEL_EDDA_META) $(RAW_EDDA)
+	./scripts/extract-parallel-EDdA.sh $^ $@
+
+$(TEI_PARALLEL_EDDA): $(PARALLEL_EDDA_META) $(TEI_EDDA)
+	./scripts/extract-parallel-EDdA.sh $^ $@
+
 $(RAW_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(RAW_LGE)
-	./scripts/extract-parallel-LGE.sh $^ $@
+	bash ./scripts/extract-parallel-LGE.sh $^ $@
 
 $(TEI_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(TEI_LGE)
 	./scripts/extract-parallel-LGE.sh $^ $@
 
-$(STANZA_PARALLEL_LGE): $(RAW_PARALLEL_LGE)
-	./scripts/stanza-annotator.py $< $@
+$(STANZA_PARALLEL_EDDA): $(LINEARIZED_PARALLEL_EDDA)
+	./scripts/stanza-txt-annotator.py $< $@
+
+$(STANZA_PARALLEL_LGE): $(LINEARIZED_PARALLEL_LGE)
+	./scripts/stanza-txt-annotator.py $< $@
 
-%/:
+$(TXM_PARALLEL_EDDA): $(STANZA_PARALLEL_EDDA)
+	./scripts/articles-to-volumes.hs $< $@
+
+$(TXM_PARALLEL_LGE): $(STANZA_PARALLEL_LGE)
+	./scripts/articles-to-volumes.hs $< $@
+
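+# Pattern rules: linearize one tome at a time by piping the list of its raw
+# .txt articles into linearize.hs, which fills the target directory.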
+$(LINEARIZED_PARALLEL_EDDA_ROOT)/T%: $(RAW_PARALLEL_EDDA)/T%
 	mkdir -p $@
+	find $< -type f -name '*.txt' | ./scripts/linearize.hs $@
 
 $(LINEARIZED_PARALLEL_LGE_ROOT)/T%: $(RAW_PARALLEL_LGE)/T%
 	mkdir -p $@
diff --git a/lib/Conllu/Parse/Paragraph.hs b/lib/Conllu/Parse/Paragraph.hs
new file mode 100644
index 0000000..8d76fb9
--- /dev/null
+++ b/lib/Conllu/Parse/Paragraph.hs
@@ -0,0 +1,28 @@
+{-# LANGUAGE NamedFieldPuns #-}
+module Conllu.Parse.Paragraph (
+      Paragraph(..)
+    , byParagraph
+  ) where
+
+import Conllu.Type (Doc, Sent(..))
+import Data.List (find, isPrefixOf)
+
+newtype Paragraph = Paragraph {
+    sentences :: [Sent]
+  }
+
+-- | Assumes the document starts with a newpar (otherwise the first sentences
+-- would have no paragraph id to be associated with; the output of our stanza
+-- annotator always starts with one)
+byParagraph :: Doc -> [Paragraph]
+byParagraph = walk []
+  where
+    walk stack (s:ss) =
+      case (getParagraph s, stack) of
+        (Just _, []) -> walk [s] ss
+        (Just _, _) -> Paragraph (reverse stack):(walk [s] ss)
+        _ -> walk (s:stack) ss
+    walk stack [] = [Paragraph $ reverse stack]
+
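+-- | Retrieve the value of the "newpar" comment a sentence may carry in its
+-- metadata.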
+getParagraph :: Sent -> Maybe String
+getParagraph = fmap snd . find (("newpar " `isPrefixOf`) . fst) . _meta
diff --git a/lib/Data/BIO.hs b/lib/Data/BIO.hs
new file mode 100644
index 0000000..f75c5ad
--- /dev/null
+++ b/lib/Data/BIO.hs
@@ -0,0 +1,45 @@
+{-# LANGUAGE OverloadedStrings #-}
+module Data.BIO
+  ( BIO(..)
+  , Tree(..)
+  , bio ) where
+
+import Control.Applicative (empty)
+import Data.ByteString as BS (splitAt)
+import Data.Csv (FromField(..), ToField(..))
+import Data.Foldable (toList)
+
+data BIO a = Begin a | Inside a | Outside deriving (Show)
+
+instance FromField a => FromField (BIO a) where
+  parseField "O" = pure Outside
+  parseField f =
+    case BS.splitAt 2 f of
+      ("B-", tag) -> Begin <$> parseField tag
+      ("I-", tag) -> Inside <$> parseField tag
+      _ -> empty
+
+instance ToField a => ToField (BIO a) where
+  toField Outside = "O"
+  toField (Begin a) = "B-" <> toField a
+  toField (Inside a) = "I-" <> toField a
+
+data Tree a b = Leaf a | Node b [Tree a b] | Root [Tree a b] deriving (Show)
+
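+-- | Rebuild the grouping encoded by BIO tags: each B-tag opens a node that
+-- collects the consecutive I-tagged elements carrying the same label, while
+-- O-tagged elements stay leaves at the root (a dangling I also opens a node).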
+bio :: (Eq b, Foldable t) => (a -> BIO b) -> t a -> Tree a b
+bio field = Root . atRoot . toList
+  where
+    atRoot [] = []
+    atRoot (a:as) =
+      case field a of
+        Outside -> (Leaf a):(atRoot as)
+        Begin b -> enter b a as
+        Inside b -> enter b a as
+    enter b a as =
+      let (nodes, otherTrees) = inside [] b as in
+      (Node b (Leaf a:nodes)):(atRoot otherTrees)
+    inside stack b0 l =
+      case field <$> take 1 l of
+        [Inside b1]
+          | b0 == b1 -> inside (Leaf (head l):stack) b0 (tail l)
+        _ -> (reverse stack, l)
diff --git a/lib/Data/Csv/DB.hs b/lib/Data/Csv/DB.hs
new file mode 100644
index 0000000..5119872
--- /dev/null
+++ b/lib/Data/Csv/DB.hs
@@ -0,0 +1,9 @@
+module Data.Csv.DB
+  ( join, left ) where
+
+import Data.Maybe (catMaybes)
+import Data.Vector as Vector (Vector, fromList, length, toList)
+
+-- | Positional outer join: pair rows index by index, padding the shorter
+-- side with Nothing, and keep the rows the selector produces.
+join :: (Maybe a -> Maybe b -> Maybe c) -> Vector a -> Vector b -> Vector c
+join selector as bs = fromList . catMaybes $ zipWith selector (pad as) (pad bs)
+  where
+    n = max (Vector.length as) (Vector.length bs)
+    pad v = (Just <$> toList v) ++ replicate (n - Vector.length v) Nothing
+
+-- | Project a field out of an optional row, mempty when the row is missing.
+left :: Monoid f => (a -> f) -> Maybe a -> f
+left _ Nothing = mempty
+left selector (Just a) = selector a
diff --git a/lib/Data/Metadata.hs b/lib/Data/Metadata.hs
index 582eaff..8af810b 100644
--- a/lib/Data/Metadata.hs
+++ b/lib/Data/Metadata.hs
@@ -1,38 +1,57 @@
-{-# LANGUAGE NamedFieldPuns #-}
+{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
 module Data.Metadata (
-      Metadata
-    , byTome
-    , metadata
+      Authors(..)
+    , Book(..)
+    , Domains(..)
+    , FromBook(..)
+    , HasAuthors(..)
+    , HasDomains(..)
+    , InFile(..)
+    , TXMText
+    , Unique(..)
+    , groupBy
+    , list
+    , readTsv
+    , sortBy
+    , writeTsv
   ) where
 
-import Data.Attoparsec.Text (Parser, endOfLine, sepBy1, takeTill)
-import Data.List as List (foldl')
-import Data.Map as Map (Map, adjust, empty, foldl', insert, member)
-import Data.Metadata.Article (Article(..), article)
-import Data.Text (Text)
+import Data.ByteString.Lazy as ByteString (readFile, writeFile)
+import Data.ByteString.Char8 as StrictByteString (pack)
+import Data.Csv
+  ( DecodeOptions(..), EncodeOptions(..), FromRecord, HasHeader(..), ToRecord
+  , decodeWith, defaultEncodeOptions, encodeWith, header, namedField )
+import Data.Foldable as Foldable (toList)
+import Data.List (sortOn)
+import Data.Map.Strict as Map (alter, empty, toList)
+import Data.Metadata.Projector
+  (FromBook(..), HasAuthors(..), HasDomains(..), InFile(..), TXMText, Unique(..))
+import Data.Metadata.Types (Authors(..), Book(..), Domains(..))
+import Data.Text as Text (Text, intercalate, unpack)
+import Data.Vector as Vector (Vector)
 
---type Metadata = Map Text Article
-type Metadata = [Article]
-type Tomes = Map Int Metadata
+list :: [Text] -> String
+list ts = Text.unpack $ ":" <> intercalate ":" ts <> ":"
 
-metadata :: Parser Metadata
---metadata = List.foldl' indexByUid Map.empty <$>
---  (skipLine *> article `sepBy1` endOfLine)
-metadata = skipLine *> article `sepBy1` endOfLine
+readTsv :: FromRecord a => FilePath -> IO (Either String (Vector a))
+readTsv source = decodeWith tsv HasHeader <$> ByteString.readFile source
   where
-    skipLine = takeTill (\c -> c == '\r' || c == '\n') *> endOfLine
-    indexByUid temp a@(Article {uid}) = Map.insert uid a temp
+    tsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')}
 
-byTome :: Metadata -> Tomes
---byTome = Map.foldl' sortByTome Map.empty
-byTome = List.foldl' sortByTome Map.empty
+writeTsv :: (Foldable t, ToRecord a) => [String] -> t a -> FilePath -> IO ()
+writeTsv fields content target =
+  ByteString.writeFile target
+  $ encodeWith tsv [bsFields] <> encodeWith tsv (Foldable.toList content)
   where
-    sortByTome temp a@(Article {uid, tome}) =
---        Map.adjust (Map.insert uid a) tome .
-        Map.adjust (a:) tome
---      . (if tome `member` temp then id else Map.insert tome Map.empty) $ temp
-      . (if tome `member` temp then id else Map.insert tome []) $ temp
-{-
-      . (if tome `member` temp then id else Map.insert tome Map.empty)
-      $ tome
-      -}
+    bsFields = StrictByteString.pack <$> fields
+    tsv = defaultEncodeOptions
+            { encDelimiter = fromIntegral (fromEnum '\t')
+            , encUseCrLf = False }
+
+sortBy :: (Foldable t, Ord k) => (a -> k) -> t a -> [a]
+sortBy field = sortOn field . Foldable.toList
+
+groupBy :: (Foldable t, Ord k) => (v -> k) -> t v -> [(k, [v])]
+groupBy field = Map.toList . foldr group Map.empty
+  where
+    group article = Map.alter (Just . maybe [article] (article:)) (field article)
diff --git a/lib/Data/Metadata/Article.hs b/lib/Data/Metadata/Article.hs
index 300c3d8..ba8c694 100644
--- a/lib/Data/Metadata/Article.hs
+++ b/lib/Data/Metadata/Article.hs
@@ -1,30 +1,41 @@
-{-# LANGUAGE OverloadedStrings #-}
-module Data.Metadata.Article (
-      Article(..)
-    , article
-  ) where
-
-import Data.Attoparsec.Text (Parser, char, decimal, sepBy1, string, takeTill)
-import Data.Text (Text, stripEnd)
-
-data Article = Article {
-      uid :: Text
-    , tome :: Int
-    , rank :: Int
-    , headWord :: Text
-    , domains :: [Text]
-  } deriving Show
-
-article :: Parser Article
-article = Article
-  <$> cell
-  <*> (decimal <* char ',')
-  <*> (decimal <* char ',')
-  <*> cell
-  <*> domains_
-  where
-    cell = takeTill (== ',') <* char ','
-
-domains_ :: Parser [Text]
-domains_ = fmap stripEnd <$>
-  (takeTill (`elem` ['|', '\r', '\n'])) `sepBy1` (string "| ")
+{-# LANGUAGE DeriveGeneric, OverloadedStrings #-}
+module Data.Metadata.Article
+  ( Article(..) ) where
+
+import Data.Csv (FromRecord(..), ToRecord(..))
+import Data.Metadata
+  ( Authors, Book, Domains
+  , FromBook(..), HasAuthors(..), HasDomains(..), InFile(..), Unique(..) )
+import Data.Text (Text, unpack)
+import GHC.Generics (Generic)
+import Text.Printf (printf)
+
+data Article = Article
+  { articleBook :: Book
+  , tome :: Int
+  , name :: Text
+  , headWord :: Text
+  , rank :: Int 
+  , page :: Int 
+  , articleAuthors :: Authors
+  , articleDomains :: Domains } deriving (Generic, Show)
+
+instance FromRecord Article
+instance ToRecord Article
+
+instance Unique Article where
+  uid (Article {articleBook, tome, name}) =
+    printf "%s_%d_%s" (show articleBook) tome (unpack name)
+
+instance FromBook Article where
+  book = articleBook
+
+instance HasAuthors Article where
+  authors_ = articleAuthors
+
+instance HasDomains Article where
+  domains_ = articleDomains
+
+instance InFile Article where
+  relativePath (Article {articleBook, tome, name}) =
+    printf "%s/T%d/%s" (show articleBook) tome (unpack name)
diff --git a/lib/Data/Metadata/Paragraph.hs b/lib/Data/Metadata/Paragraph.hs
new file mode 100644
index 0000000..2f59105
--- /dev/null
+++ b/lib/Data/Metadata/Paragraph.hs
@@ -0,0 +1,40 @@
+module Data.Metadata.Paragraph
+  ( Paragraph(..) ) where
+
+import Data.Csv (FromRecord(..), ToRecord(..))
+import Data.Metadata
+  ( Authors, Book(..), Domains
+  , FromBook(..), HasAuthors(..), HasDomains(..), InFile(..), Unique(..) )
+import GHC.Generics (Generic)
+import Text.Printf (printf)
+
+data Paragraph = Paragraph
+  { tome :: Int
+  , name :: String
+  , headWord :: String
+  , paragraphAuthors :: Authors
+  , paragraphDomains :: Domains
+  , paragraphId :: Int
+  , content :: String
+  , nbPlaceSpacy :: Int
+  , nbPersonSpacy :: Int } deriving (Generic, Show)
+
+instance FromRecord Paragraph
+instance ToRecord Paragraph
+
+instance Unique Paragraph where
+  uid (Paragraph {tome, name, paragraphId}) =
+    printf "EDdA_%d_%s_%d" tome name paragraphId
+
+instance FromBook Paragraph where
+  book _ = EDdA
+
+instance HasAuthors Paragraph where
+  authors_ = paragraphAuthors
+
+instance HasDomains Paragraph where
+  domains_ = paragraphDomains
+
+instance InFile Paragraph where
+  relativePath (Paragraph {tome, name, paragraphId}) =
+    printf "T%d/article%s-%d" tome name paragraphId
diff --git a/lib/Data/Metadata/Projector.hs b/lib/Data/Metadata/Projector.hs
new file mode 100644
index 0000000..f37d5f5
--- /dev/null
+++ b/lib/Data/Metadata/Projector.hs
@@ -0,0 +1,33 @@
+{-# LANGUAGE ConstraintKinds #-}
+module Data.Metadata.Projector
+  ( FromBook(..)
+  , HasAuthors(..) 
+  , HasDomains(..)
+  , InFile(..)
+  , TXMText
+  , Unique(..) ) where
+
+import Data.Metadata.Types (Authors(..), Book, Domains(..))
+import Data.Text (Text)
+
+class Unique a where
+  uid :: a -> String
+
+class FromBook a where
+  book :: a -> Book
+
+class HasAuthors a where
+  authors_ :: a -> Authors
+
+  authors :: a -> [Text]
+  authors = getAuthors . authors_
+
+class HasDomains a where
+  domains_ :: a -> Domains
+
+  domains :: a -> [Text]
+  domains = getDomains . domains_
+
+class InFile a where
+  relativePath :: a -> FilePath
+
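+-- | All the projections needed to emit the attributes of a TXM <text>
+-- element (see Text.TEI).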
+type TXMText a = (Unique a, FromBook a, HasAuthors a, HasDomains a)
diff --git a/lib/Data/Metadata/Trie.hs b/lib/Data/Metadata/Trie.hs
new file mode 100644
index 0000000..2383ad5
--- /dev/null
+++ b/lib/Data/Metadata/Trie.hs
@@ -0,0 +1,54 @@
+{-# LANGUAGE NamedFieldPuns #-}
+module Data.Metadata.Trie
+  ( Trie, at, index, insert, trieOf ) where
+
+import Data.Foldable (toList)
+import Data.Map (Map)
+import qualified Data.Map as Map (delete, empty, findWithDefault, insert, lookup, singleton)
+
+data Trie e n =
+  Trie
+    { store :: Maybe n
+    , edges :: Edges e n }
+type Edges e n = Map e (Trie e n)
+
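+-- | One-hole context over a Trie: 'Zip' remembers the value and the sibling
+-- edges of the node we descended from, so 'zipUp' can rebuild the whole trie.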
+data Zipper e n =
+    Top
+  | Zip
+      { atValue :: Maybe n
+      , otherEdges :: Edges e n
+      , byEdge :: e
+      , above :: Zipper e n }
+
+type EditState e n = (Trie e n, Zipper e n)
+
+empty :: Trie e n
+empty = Trie { store = Nothing, edges = Map.empty }
+
+zipUp :: Ord e => EditState e n -> Trie e n
+zipUp (t, Top) = t
+zipUp (t, Zip {atValue, otherEdges, byEdge, above}) =
+  zipUp
+    ( Trie { store = atValue, edges = Map.insert byEdge t otherEdges }
+    , above )
+
+trieOf :: Foldable t => t e -> n -> Trie e n
+trieOf path n = foldr insertAt (Trie { store = Just n, edges = Map.empty }) path
+  where
+    insertAt e t = Trie { store = Nothing, edges = Map.singleton e t }
+
+at :: (Foldable t, Ord e) => Trie e n -> t e -> Maybe n
+at trie = at_ trie . toList
+  where
+    at_ (Trie {store}) [] = store
+    at_ (Trie {edges}) (e:es) = Map.lookup e edges >>= (`at_` es)
+
+insert :: (Foldable t, Ord e) => t e -> n -> Trie e n -> Trie e n
+insert path n trie = zipUp (edit (toList path) (trie, Top))
+  where
+    -- walk down the path through the zipper, then store the value at the end
+    edit [] (t, z) = (t {store = Just n}, z)
+    edit (e:es) (Trie {store, edges}, z) =
+      edit es ( Map.findWithDefault empty e edges
+              , Zip {atValue = store, otherEdges = Map.delete e edges, byEdge = e, above = z} )
+
+type Indexed e n = ([e], n)
+
+index :: (Foldable t, Functor t, Ord e) => (a -> Indexed e n) -> t a -> Trie e n
+index by = foldr (uncurry insert) empty . fmap by
diff --git a/lib/Data/Metadata/Types.hs b/lib/Data/Metadata/Types.hs
new file mode 100644
index 0000000..608bad0
--- /dev/null
+++ b/lib/Data/Metadata/Types.hs
@@ -0,0 +1,38 @@
+{-# LANGUAGE OverloadedStrings #-}
+module Data.Metadata.Types
+  ( Authors(..)
+  , Book(..)
+  , Domains(..) ) where
+
+import Control.Applicative (empty)
+import Data.Csv (Field, FromField(..), Parser, ToField(..))
+import Data.Text (Text, intercalate, splitOn)
+
+data Book = EDdA | LGE | Wikipedia deriving (Eq, Ord, Read, Show)
+
+newtype Authors = Authors
+  { getAuthors :: [Text] } deriving Show
+newtype Domains = Domains
+  { getDomains :: [Text] } deriving Show
+
+instance FromField Book where
+  parseField "EDdA" = pure EDdA
+  parseField "LGE" = pure LGE
+  parseField "Wikipedia" = pure Wikipedia
+  parseField _ = empty
+instance ToField Book where
+  toField = toField . show
+
+sepBy :: Text -> Field -> Parser [Text]
+sepBy s = fmap (splitOn s) . parseField
+
+instance FromField Authors where
+  parseField = fmap Authors . sepBy " & "
+instance ToField Authors where
+  toField = toField . intercalate " & ". getAuthors
+
+instance FromField Domains where
+  parseField = fmap Domains . sepBy " | "
+instance ToField Domains where
+  toField = toField . intercalate " | ". getDomains
+
diff --git a/lib/Data/SpaCy.hs b/lib/Data/SpaCy.hs
new file mode 100644
index 0000000..79a52a8
--- /dev/null
+++ b/lib/Data/SpaCy.hs
@@ -0,0 +1,16 @@
+module Data.SpaCy
+  ( Annotation(..) ) where
+
+import Data.BIO (BIO)
+import Data.Csv (FromRecord(..), ToRecord(..))
+import GHC.Generics (Generic)
+
+data Annotation = Annotation
+  { tokenNumber :: Int
+  , form :: String
+  , lemma :: String
+  , pos :: String
+  , namedEntity :: BIO String } deriving (Generic, Show)
+
+instance FromRecord Annotation
+instance ToRecord Annotation
diff --git a/lib/Text/Editor.bak.hs b/lib/Text/Editor.bak.hs
new file mode 100644
index 0000000..5f7e00b
--- /dev/null
+++ b/lib/Text/Editor.bak.hs
@@ -0,0 +1,101 @@
+{-# LANGUAGE FlexibleInstances, NamedFieldPuns, OverloadedStrings #-}
+module Text.Editor (
+    --  Editor
+    --, Filter(..)
+      Editable(..)
+    --, (|->)
+    --, (|=>)
+    , (<>.)
+    , editor
+    --, pureEditor
+    , fileFilter
+    --, runAll
+    , stdinList
+  ) where
+
+import Control.Monad.IO.Class (MonadIO(..))
+import Data.Text as Text (Text, intercalate, lines, pack, unpack)
+import Data.Text.IO as Text (readFile, writeFile)
+import System.FilePath (replaceDirectory, replaceExtension)
+
+data Filter m a b = Filter {
+      transform :: a -> m b
+    , outputFormat :: Maybe String
+  }
+
+type Editor m a = Filter m a a
+
+{-
+data Editor m a = Editor {
+      transform :: a -> m a
+    , outputFormat :: Maybe String
+  }
+-}
+
+{-
+editor :: (a -> m a) -> Editor m a
+editor f = Filter { transform = f, outputFormat = Nothing }
+--editor f = Editor { transform = f, outputFormat = Nothing }
+
+pureEditor :: Applicative m => (a -> a) -> Editor m a
+pureEditor = editor . (pure .)
+-}
+
+class Editable a where
+  enter :: Text -> a
+  leave :: a -> Text
+
+instance Editable [Text] where
+  enter = Text.lines
+  leave = Text.intercalate "\n"
+
+instance Editable String where
+  enter = Text.unpack
+  leave = Text.pack
+
+type EditionProcess m = FilePath -> FilePath -> m ()
+
+fileFilter :: (Editable a, Editable b, MonadIO m) =>
+  (FilePath -> FilePath) -> (a -> m b) -> EditionProcess m
+fileFilter fixFormat transform target input =
+--(|->) (Editor {transform, outputFormat}) target input =
+    liftIO (Text.readFile input)
+  >>= transform . enter
+  >>= liftIO . Text.writeFile output . leave
+  where
+    output = fixFormat $ replaceDirectory input target
+
+(<>.) :: (Editable a, Editable b, MonadIO m) =>
+  (a -> m b) -> String -> EditionProcess m
+(<>.) transform extension = fileFilter (`replaceExtension` extension) transform
+
+editor :: (Editable a, Editable b, MonadIO m) =>
+  (a -> m b) -> EditionProcess m 
+editor = fileFilter id
+
+{-
+run :: (Editable a, Editable b, MonadIO m) => Filter m a b -> FilePath -> FilePath -> m ()
+run = (|->)
+
+(|->) :: (Editable a, Editable b, MonadIO m) => Filter m a b -> FilePath -> FilePath -> m ()
+--(|->) :: (Editable a, MonadIO m) => Editor m a -> FilePath -> FilePath -> m ()
+(|->) (Filter {transform, outputFormat}) target input =
+--(|->) (Editor {transform, outputFormat}) target input =
+    liftIO (Text.readFile input)
+  >>= transform . enter
+  >>= liftIO . Text.writeFile output . leave
+  where
+    fixFormat = maybe id (flip replaceExtension) outputFormat
+    output = fixFormat $ replaceDirectory input target
+
+runAll :: (Editable a, Foldable t, MonadIO m) => Editor m a -> FilePath -> t FilePath -> m ()
+runAll = (|=>)
+
+-- (|=>) :: (Editable a, MonadIO m) => Editor m a -> FilePath -> String -> m ()
+(|=>) :: (Editable a, Foldable t, MonadIO m) => Editor m a -> FilePath -> t FilePath -> m ()
+--editor |=> target = mapM_ (editor |-> target) . Prelude.lines
+editor |=> target = mapM_ (editor |-> target)
+-}
+
+stdinList :: MonadIO m => m [FilePath]
+stdinList = Prelude.lines <$> liftIO getContents
diff --git a/lib/Text/Editor.hs b/lib/Text/Editor.hs
index b5c4148..2d7a68b 100644
--- a/lib/Text/Editor.hs
+++ b/lib/Text/Editor.hs
@@ -1,19 +1,17 @@
-{-# LANGUAGE FlexibleInstances, OverloadedStrings #-}
+{-# LANGUAGE FlexibleInstances, NamedFieldPuns, OverloadedStrings #-}
 module Text.Editor (
-      Editor
-    , edit
-    , editAll
-    , editM
-    , editAllM
+      Editable(..)
+    , (<>.)
+    , apply
+    , fileFilter
+    , xargs
   ) where
 
 import Control.Monad.IO.Class (MonadIO(..))
 import Data.Text as Text (Text, intercalate, lines, pack, unpack)
 import Data.Text.IO as Text (readFile, writeFile)
-import System.FilePath (replaceDirectory)
-
-type Editor a = a -> a
-type EditorM m a = a -> m a
+import System.FilePath (replaceDirectory, replaceExtension)
 
 class Editable a where
   enter :: Text -> a
@@ -27,26 +25,23 @@ instance Editable String where
   enter = Text.unpack
   leave = Text.pack
 
--- | Apply an 'Editor' to the content of a file which path is passed as third
--- argument. The second argument is the path to the target folder where the
--- edited version will be created.
-editM :: (Editable a, MonadIO m) => EditorM m a -> FilePath -> FilePath -> m ()
-editM editor target input =
+type Editor m = FilePath -> FilePath -> m ()
+
+fileFilter :: (Editable a, Editable b, MonadIO m) =>
+  (FilePath -> FilePath) -> (a -> m b) -> Editor m
+fileFilter fixFormat transform target input =
     liftIO (Text.readFile input)
-  >>= editor . enter
+  >>= transform . enter
   >>= liftIO . Text.writeFile output . leave
   where
-    output = replaceDirectory input target
+    output = fixFormat $ replaceDirectory input target
 
--- | A convenient shortcut of 'editM' for pure editors
-edit :: Editable a => Editor a -> FilePath -> FilePath -> IO ()
-edit editor = editM (pure . editor)
+(<>.) :: (Editable a, Editable b, MonadIO m) => (a -> m b) -> String -> Editor m
+(<>.) transform extension = fileFilter (`replaceExtension` extension) transform
 
--- | Apply an 'Editor' like 'edit' on all the files which paths are expected to
--- be read from the input 'String', one per line.
-editAllM :: (Editable a, MonadIO m) => EditorM m a -> FilePath -> String -> m ()
-editAllM editor target = mapM_ (editM editor target) . Prelude.lines
+apply :: (Editable a, Editable b, MonadIO m) => (a -> m b) -> Editor m
+apply = fileFilter id
 
--- | A convenient shortcut of 'editAllM' for pure editors
-editAll :: Editable a => Editor a -> FilePath -> String -> IO ()
-editAll editor = editAllM (pure . editor)
+xargs :: (Editable a, Editable b, MonadIO m) => (a -> m b) -> FilePath -> m ()
+xargs transform target =
+  Prelude.lines <$> liftIO getContents >>= mapM_ (apply transform target)
diff --git a/lib/Text/Filter.hs b/lib/Text/Filter.hs
new file mode 100644
index 0000000..d55000e
--- /dev/null
+++ b/lib/Text/Filter.hs
@@ -0,0 +1,50 @@
+{-# LANGUAGE FlexibleInstances, NamedFieldPuns, OverloadedStrings #-}
+module Text.Filter (
+      Editable(..)
+    , (<>.)
+    , apply
+    , fileFilter
+    , xargs
+  ) where
+
+import Control.Monad.IO.Class (MonadIO(..))
+import Data.Text as Text (Text, intercalate, lines, pack, unpack)
+import Data.Text.IO as Text (readFile, writeFile)
+import System.FilePath (replaceDirectory, replaceExtension)
+
+class Editable a where
+  enter :: Text -> a
+  leave :: a -> Text
+
+instance Editable Text where
+  enter = id
+  leave = id
+
+instance Editable [Text] where
+  enter = Text.lines
+  leave = Text.intercalate "\n"
+
+instance Editable String where
+  enter = Text.unpack
+  leave = Text.pack
+
+type Filter m = FilePath -> FilePath -> m ()
+
+fileFilter :: (Editable a, Editable b, MonadIO m) =>
+  (FilePath -> FilePath) -> (a -> m b) -> Filter m
+fileFilter fixFormat transform target input =
+    liftIO (Text.readFile input)
+  >>= transform . enter
+  >>= liftIO . Text.writeFile output . leave
+  where
+    output = fixFormat $ replaceDirectory input target
+
+(<>.) :: (Editable a, Editable b, MonadIO m) => (a -> m b) -> String -> Filter m
+(<>.) transform extension = fileFilter (`replaceExtension` extension) transform
+
+apply :: (Editable a, Editable b, MonadIO m) => (a -> m b) -> Filter m
+apply = fileFilter id
+
+xargs :: (Editable a, Editable b, MonadIO m) => (a -> m b) -> FilePath -> m ()
+xargs transform target =
+  Prelude.lines <$> liftIO getContents >>= mapM_ (apply transform target)
diff --git a/lib/Text/TEI.hs b/lib/Text/TEI.hs
new file mode 100644
index 0000000..7514df9
--- /dev/null
+++ b/lib/Text/TEI.hs
@@ -0,0 +1,99 @@
+{-# LANGUAGE ConstraintKinds #-}
+module Text.TEI
+  ( (.=)
+  , corpusHeader
+  , publicationStmt
+  , sourceDesc
+  , teiHeader
+  , text
+  , title ) where
+
+import Data.Metadata
+  ( Book(..), FromBook(..), HasAuthors(..), HasDomains(..), TXMText, Unique(..)
+  , list )
+import Text.XML.HXT.Core ( ArrowXml, XmlTree, aelem, attr, mkelem, selem, txt )
+
+corpusHeader :: ArrowXml a => Book -> String -> a b XmlTree
+corpusHeader EDdA = eddaHeader
+corpusHeader LGE = lgeHeader
+corpusHeader Wikipedia = wikiHeader
+
+eddaHeader :: ArrowXml a => String -> a b XmlTree
+eddaHeader strTome =
+  teiHeader
+    (titleStmt strTome ("Digitized" `by` "University of Chicago Library"
+                       ++ "Published" `by` "ARTFL"))
+    (corpusPublicationStmt)
+    (sourceDesc
+      [ title ("L'Encyclopédie T" <> strTome)
+      , selem "author" [ txt "Collective" ]
+      , selem "creation" [ selem "date" [ txt "1752" ] ] ])
+
+lgeHeader :: ArrowXml a => String -> a b XmlTree
+lgeHeader strTome =
+  teiHeader
+    (titleStmt strTome ("Digitized" `by` "Bibliothèque Nationale de France"))
+    (corpusPublicationStmt)
+    (sourceDesc
+      [ title ("La Grande Encyclopédie T" <> strTome)
+      , selem "author" [ txt "Collective" ]
+      , selem "creation" [ selem "date" [ txt "1885" ] ]
+      , selem "imprint" [ selem "date" [ txt "1885" ]
+                        , selem "publisher" [ txt "H. Lamirault et Cie," ]
+                        , selem "pubplace" [ txt "Paris" ] ]
+      , mkelem "biblScope" [ attr "unit" (txt "volume") ] [ txt strTome ] ])
+
+wikiHeader :: ArrowXml a => String -> a b XmlTree
+wikiHeader _ =
+  teiHeader
+    (titleStmt "Wikipédia, L'encyclopédie libre" [])
+    (corpusPublicationStmt)
+    (sourceDesc
+      [ selem "author" [ selem "orgName" [ txt "Wikimedia Foundation" ] ] ])
+
+teiHeader :: ArrowXml a =>
+  a b XmlTree -> a b XmlTree -> a b XmlTree -> a b XmlTree
+teiHeader aTitle aPublication aSource =
+  selem "teiHeader"
+    [ selem "fileDesc" [ aTitle, aPublication, aSource ] ]
+
+titleStmt :: ArrowXml a => String -> [a b XmlTree] -> a b XmlTree
+titleStmt tome resps =
+  selem "titleStmt"
+    [ title tome
+    , selem "respStmt" (resps ++ "Annotated and encoded" `by` "ICAR") ]
+
+title :: ArrowXml a => String -> a b XmlTree
+title titleContent = selem "title" [txt titleContent]
+
+corpusPublicationStmt :: ArrowXml a => a b XmlTree
+corpusPublicationStmt =
+  (publicationStmt
+    [selem "distributor"
+      [selem "orgName" [txt "Project GÉODE"]
+      ,selem "address"
+          [selem "addrline" [txt "ICAR UMR 5191"]
+          ,selem "addrline" [txt "ENS de Lyon"]]]])
+
+publicationStmt :: ArrowXml a => [a b XmlTree] -> a b XmlTree
+publicationStmt = selem "publicationStmt"
+
+sourceDesc :: ArrowXml a => [a b XmlTree] -> a b XmlTree
+sourceDesc biblContent =
+  selem "sourceDesc" [selem "bibl" biblContent]
+
+by :: ArrowXml a => String -> String -> [a b XmlTree]
+by resp orgName = [ selem "resp" [ txt (resp <> " by") ]
+                  , selem "orgName" [ txt orgName ] ]
+
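+-- | Wrap a body into a TXM <text> element carrying the uid, book, author and
+-- domains of the record as attributes, followed by a <milestone> of the
+-- given unit.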
+text :: (ArrowXml a, TXMText b) => String -> b -> a c XmlTree -> a c XmlTree
+text unitName txmText body =
+  mkelem "text" metadata [ body, aelem "milestone" [ "unit" .= unitName ] ]
+  where
+    metadata =
+      [ "uid" .= uid txmText
+      , "book" .= show (book txmText)
+      , "author" .= list (authors txmText)
+      , "domains" .= list (domains txmText) ]
+
+(.=) :: ArrowXml a => String -> String -> a b XmlTree
+key .= value = attr key (txt value)
diff --git a/manifest.scm b/manifest.scm
index 19fcf85..af89124 100644
--- a/manifest.scm
+++ b/manifest.scm
@@ -3,21 +3,27 @@
              ((gnu packages commencement) #:select (gcc-toolchain))
              ((gnu packages haskell) #:select (ghc))
              ((gnu packages haskell-web) #:select (ghc-hxt))
-             ((gnu packages haskell-xyz) #:select (ghc-attoparsec))
+             ((gnu packages haskell-xyz) #:select (ghc-cassava ghc-hs-conllu))
              ((gnu packages python) #:select (python))
+             ((gnu packages python-science) #:select (python-pandas))
              ((gnu packages python-xyz) #:select (python-beautifulsoup4))
              ((gnu packages xml) #:select (python-lxml)))
 
+(define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm"))
+
 (packages->manifest
   (list
     coreutils ; mktemp for atomic processing, strip CSV headers, general scripting
     gcc-toolchain ; running haskell
     ghc ; running haskell
-    ghc-attoparsec ; parsing metadata
+    ghc-cassava ; working with CSV in haskell
+    ghc-hs-conllu ; working on syntax-annotated documents
     ghc-hxt ; working on xml documents
     python ; scripts
     python-beautifulsoup4 ; extract EDdA metadata from TEI files
+    python-edda ; EDdA helpers, ENCCRE API queries for domain metadata
     python-lxml ; fusion articles into tomes for TXM
+    python-pandas ; working with CSV in python
     python-stanza ; annotation
     stanza-fr ; annotation
     ))
diff --git a/scripts/EDdA-metadata.py b/scripts/EDdA-metadata.py
index 05c4ae2..e804019 100755
--- a/scripts/EDdA-metadata.py
+++ b/scripts/EDdA-metadata.py
@@ -2,17 +2,39 @@
 
 import csv
 from bs4 import BeautifulSoup
+from EDdA import ENCCRE
+import pandas
 import os
 import sys
 
-header = ["T", "article", "head", "domain"]
+header = ["T", "article", "head", "author", "ARTFL_domain", "ENCCRE_domain"]
 
 def getAttribute(article, attribute):
     result = article.find(type=attribute)
     return result.get('value') if result else ''
 
-def main(rootDirectory):
-    output = csv.writer(sys.stdout, lineterminator='\n')
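+# Map an ARTFL article (tome, rank) to its ENCCRE id through the
+# correspondence table; None when the article has no ENCCRE counterpart.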
+def idARTFLToENCCRE(correspondances, tome, article):
+    try:
+        row = correspondances.loc[(correspondances['tome']==tome) &
+                                  (correspondances['article']==article)].reset_index(drop=True)
+        return row['entreeid'][0]
+    except KeyError:
+        return None
+
+ENCCREDomains = ENCCRE.query('domaines')
+
+def getENCCREDomain(enccreId):
+    if enccreId:
+        article = ENCCRE.query('article/%s' % enccreId)
+        annotation = article['annotations']
+        if 'constit' in annotation and 'domgen' in annotation['constit'][0]:
+            domains = annotation['constit'][0]['domgen']
+            return ' | '.join(
+                set(map(lambda d: ENCCREDomains[d]['dgrid'], domains))
+                )
+
+def main(rootDirectory, correspondances):
+    output = csv.writer(sys.stdout, lineterminator='\n', delimiter='\t')
     output.writerow(header)
     for t in range(1,18):
         path = f"{rootDirectory}/T{t}"
@@ -23,8 +45,10 @@ def main(rootDirectory):
                 t,
                 rank,
                 getAttribute(root, "head"),
-                normclass if normclass != 'unclassified' else getAttribute(root, "generatedclass")
+                getAttribute(root, "author"),
+                normclass,
+                getENCCREDomain(idARTFLToENCCRE(correspondances, t, rank))
                 ])
 
 if __name__ == '__main__':
-    main(sys.argv[1])
+    main(sys.argv[1], pandas.read_csv(sys.argv[2]))
diff --git a/scripts/LGE-metadata-from-EDdA.py b/scripts/LGE-metadata-from-EDdA.py
index 1fabdee..ab1119a 100755
--- a/scripts/LGE-metadata-from-EDdA.py
+++ b/scripts/LGE-metadata-from-EDdA.py
@@ -8,7 +8,8 @@ def EDdARow(columns):
             'tome': columns[0],
             'article': columns[1],
             'head': columns[2],
-            'domain': columns[3]
+            'ARTFL_domain': columns[3],
+            'ENCCRE_domain': columns[4]
             }
 
 def LGERow(tome):
@@ -140,14 +141,19 @@ def interesting(arrow):
 def getMetadata(arrows, path=None):
     output = sys.stdout if path is None else open(path, 'w')
     toCsv = csv.writer(output, lineterminator='\n')
-    toCsv.writerow(['id', 'tome', 'rank', 'head', 'domain'])
+    toCsv.writerow(
+            ['id', 'tomeLGE', 'rankLGE', 'head', 'tomeEDdA', 'rankEDdA', 'ARTFL_domain', 'ENCCRE_domain']
+            )
     for arrow in arrows:
         toCsv.writerow([
             arrow['target']['id'],
             arrow['target']['tome'],
             arrow['target']['rank'],
             arrow['target']['head'],
-            arrow['source']['domain']
+            arrow['source']['tome'],
+            arrow['source']['article'],
+            arrow['source']['ARTFL_domain'],
+            arrow['source']['ENCCRE_domain']
             ])
 
 if __name__ == '__main__':
diff --git a/scripts/add-domain-metadata.hs b/scripts/add-domain-metadata.hs
new file mode 100755
index 0000000..43ec1e1
--- /dev/null
+++ b/scripts/add-domain-metadata.hs
@@ -0,0 +1,50 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE DeriveGeneric, NamedFieldPuns, OverloadedStrings #-}
+
+--import Data.ByteString.Lazy as ByteString (readFile)
+import Data.Csv (FromRecord)
+import Data.Metadata (Domains(..), readTsv)
+--import Data.Metadata (Article, articles)
+import Data.Text as Text (Text, intercalate, isPrefixOf)
+import GHC.Generics (Generic)
+import System.Environment (getArgs)
+import System.FilePath ((</>), (<.>))
+import System.Exit (die)
+import System.Script (syntax)
+import Text.Editor (apply)
+
+data Line = Line {
+      tome :: String
+    , rank :: String
+    , domains :: Domains
+  } deriving (Generic)
+
+instance FromRecord Line
+
+to :: FilePath -> FilePath -> Line -> IO ()
+to source target (Line {tome, rank, domains}) =
+  apply (pure . addMeta domains) target input
+  where
+    input = source </> "T" <> tome <> "_article" <> rank <.> "xml"
+
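+-- Splice the domain and class entries after the first six lines of the file
+-- (the <meta> header down to its "head" entry).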
+addMeta :: Domains -> [Text] -> [Text]
+addMeta (Domains domains) textLines =
+    take 6 textLines
+  ++ [("domain\t" <> intercalate " | " domains)
+     ,("class\t" <> getClass domains)]
+  ++ drop 6 textLines
+
+getClass :: [Text] -> Text
+getClass domainsList
+  | all hasGeo domainsList = "geography"
+  | any hasGeo domainsList = "some_geography"
+  | otherwise = "other"
+  where
+    hasGeo = ("Géographie" `isPrefixOf`)
+
+main :: IO ()
+main = getArgs >>= run
+  where
+    run [inputTsv, source, target] =
+      readTsv inputTsv >>= either die (mapM_ (source `to` target))
+    run _ = syntax "METADATA_TSV_FILE SOURCE TARGET"
diff --git a/scripts/articlesToVolume.sh b/scripts/articles-to-volume.sh
similarity index 100%
rename from scripts/articlesToVolume.sh
rename to scripts/articles-to-volume.sh
diff --git a/scripts/conllu-to-lexicoscope.hs b/scripts/conllu-to-lexicoscope.hs
new file mode 100755
index 0000000..705f39a
--- /dev/null
+++ b/scripts/conllu-to-lexicoscope.hs
@@ -0,0 +1,95 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
+
+import Conllu.Parse (parseConllu)
+import Conllu.Parse.Paragraph (Paragraph(..), byParagraph)
+import Conllu.Print (printSent)
+import Conllu.Type (Doc, Sent(..))
+import Control.Monad.IO.Class (MonadIO(..))
+import Control.Monad.State (StateT(..), evalStateT)
+import Data.List as List (intercalate)
+import Data.Metadata
+  ( HasAuthors(..), HasDomains(..), Unique(..), groupBy, list, readTsv, sortBy )
+import Data.Metadata.Article (Article(..))
+import Data.Text as Text (unpack)
+import System.Environment (getArgs)
+import System.Exit (die)
+import System.FilePath ((</>), (<.>))
+import System.Script (syntax)
+import Text.XML.HXT.Core
+  ((>>>), ($<), ArrowXml, IOSLA(..), IOStateArrow, XmlTree, attr, mkelem, selem
+  , txt, writeDocument)
+import Text.XML.HXT.Arrow.XmlState.TypeDefs (Selector(..), chgS, theUserState)
+import Text.XML.HXT.Arrow.XmlState.RunIOStateArrow (initialState)
+
+type Serial = (Int, Int)
+
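+-- Serial threads the running paragraph and sentence counters through the
+-- XML arrows; 'next' emits the current value of one counter and bumps it.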
+next :: Selector Serial Int -> IOStateArrow Serial a String
+next intSelector = IOSLA getUpdate
+  where
+    getUpdate state _ = pure (chgS field (+1) state, [show $ getS field state])
+    field = theUserState >>> intSelector
+
+pId :: Selector Serial Int
+pId = S {getS = fst, setS = \p (_, s) -> (p, s)}
+
+sId :: Selector Serial Int
+sId = S {getS = snd, setS = \s (p, _) -> (p, s)}
+
+to :: FilePath -> FilePath -> (Int, [Article]) -> StateT Serial IO ()
+to source target (tomeNumber, tomeArticles) =
+  mapM_ (processArticle (source </> tomeDir) (target </> tomeDir)) tomeArticles
+  where
+    tomeDir = "T" <> show tomeNumber
+
+processArticle :: FilePath -> FilePath -> Article -> StateT Serial IO ()
+processArticle source target a =
+  liftIO (putStrLn ("processing " <> input) *> Prelude.readFile input)
+    >>= either (liftIO . die) (format a output) . parseConllu input
+  where
+    input = source </> uid a <.> "conllu"
+    output = target </> uid a <.> "xml"
+    --input = source </> "article" ++ show rank <.> "conllu"
+    --output = target </> "article" ++ show rank <.> "xml"
+
+format :: Article -> FilePath -> Doc -> StateT Serial IO ()
+format _ _ [] = pure ()
+format article outputPath dom = StateT $ \s -> do
+  (newState, _) <- runIOSLA (xml >>> writeDocument [] outputPath) (initialState s) ()
+  pure ((), getS theUserState newState)
+  where
+    xml = selem "/"
+            [selem "corpus"
+              [selem "doc"
+                [selem "meta" [metaFrom article]
+                ,selem "text" (formatParagraph <$> byParagraph dom)]]]
+
+metaFrom :: ArrowXml a => Article -> a n XmlTree
+metaFrom a@(Article {tome, rank, headWord}) = txt $
+  concatMap (++ "\n")
+    (List.intercalate "\t" <$>
+      [[]
+      ,["fileName", uid a]
+      ,["tome", show tome]
+      ,["rank", show rank]
+      ,["head", Text.unpack headWord]
+      ,["author", list (authors a)]
+      ,["domain", list (domains a)]])
+
+formatParagraph :: Paragraph -> IOStateArrow Serial n XmlTree
+formatParagraph (Paragraph sents) =
+  mkelem "p" [attr "id" (txt . ('p':) $< next pId)] (formatSentence <$> sents)
+
+formatSentence :: Sent -> IOStateArrow Serial n XmlTree
+formatSentence s =
+  mkelem "s" [attr "id" (txt . ('s':) $< next sId)] [txt sentences]
+  where
+    sentences = dropWhile (== '\n') $ printSent (s {_meta = []})
+
+main :: IO ()
+main = getArgs >>= cli
+  where
+    withSerial = (`evalStateT` (0, 0))
+    byTome = groupBy tome . sortBy rank
+    cli [sourceMeta, sourceRoot, targetRoot] =
+        readTsv sourceMeta
+      >>= either die (withSerial . mapM_ (sourceRoot `to` targetRoot) . byTome)
+    cli _ = syntax "METADATA_TSV_FILE SOURCE_DIR TARGET_DIR"
diff --git a/scripts/conllu-to-txm.hs b/scripts/conllu-to-txm.hs
new file mode 100755
index 0000000..8487819
--- /dev/null
+++ b/scripts/conllu-to-txm.hs
@@ -0,0 +1,97 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
+
+import Conllu.Parse (parseConllu)
+import Conllu.Type (AW, CW(..), ID(..), Sent(..))
+import Control.Applicative ((<|>), liftA2)
+import Data.Metadata
+  ( HasAuthors(..), Book, FromBook(..), relativePath, uid, groupBy, readTsv
+  , sortBy )
+import Data.Metadata.Article (Article(..))
+import Data.Text as Text (unpack)
+import System.Environment (getArgs)
+import System.Exit (die)
+import System.FilePath ((</>), (<.>))
+import System.Script (syntax)
+import Text.TEI
+  ( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title )
+import Text.XML.HXT.Core
+  ( (>>>), (|||), ($<), IOSLA(..), IOSArrow, XmlTree, arr, arrIO, arrIO0, arrL
+  , attr, constL, mkelem, mkText, runX, selem, txt, unlistA, withIndent
+  , writeDocument )
+
+to :: FilePath -> FilePath -> (Book, [Article]) -> IO ()
+to source target (theBook, metadata) =
+  mapM_ runArrow . groupBy tome $ sortBy rank metadata
+  where
+    runArrow (tomeNumber, tomeArticles) =
+      runX $ compileTome source target (theBook, tomeNumber, tomeArticles)
+
+compileTome ::
+  FilePath -> FilePath -> (Book, Int, [Article]) -> IOSArrow b XmlTree
+compileTome source target (theBook, tomeNumber, metadata) =
+  tomeXml >>> writeDocument [withIndent True] output
+  where
+    strTome = show tomeNumber
+    output = target </> (show theBook) </> "T" <> strTome <.> "xml"
+    tomeXml = selem "/"
+                [ selem "teiCorpus"
+                  [ corpusHeader theBook strTome
+                  , (articleFrom source $< constL metadata) ] ]
+
+articleFrom :: FilePath -> Article -> IOSArrow b XmlTree
+articleFrom source article@(Article {headWord}) =
+  selem "TEI"
+    [ teiHeader
+        (selem "titleStmt" [title $ Text.unpack headWord])
+        (publicationStmt
+          [ selem "p" [ txt "Annotated with Stanza by project GEODE" ] ])
+        (sourceDesc [ authorArrow >>> selem "author" [ mkText ] ])
+    , text "article" article (loadConllu input) ]
+  where
+    input = source </> relativePath article <.> "conllu"
+    authorArrow = constL (Text.unpack <$> authors article)
+
+{-
+metadataFrom :: Article -> [ IOSArrow b XmlTree ]
+metadataFrom article@(Article {book, authors, domains}) =
+  [ "id" .= uid article
+  , "book" .= show book
+  , "author" .= list (getAuthors authors)
+  , "domains" .= list (getDomains domains) ]
+-}
+
+loadConllu :: FilePath -> IOSArrow b XmlTree
+loadConllu input = arrIO0 (parseConllu input <$> readFile input)
+                  >>> ((arrIO die) ||| format)
+  where
+    format = selem "body" [ unlistA >>> formatSentence ]
+    formatSentence = selem "s" [ arr _words >>> arrL render >>> formatWord ]
+    formatWord = mkelem "w" [ attr "lemma" (txt $< getMaybe _lemma)
+                            , attr "pos" (txt $< getMaybe pos) ]
+                            [ txt $< getMaybe _form ]
+    pos = liftA2 (<|>) spos _xpos
+
+getMaybe :: (a -> Maybe b) -> IOSArrow a b
+getMaybe projector = IOSLA $ \s a -> pure (s, maybe [] (:[]) $ projector a)
+
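+-- Collapse CoNLL-U multi-word tokens: when a range token (a-b) is directly
+-- followed by its two parts, merge them into a single word whose lemma and
+-- POS are the parts' values glued with '+'.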
+render :: [CW AW] -> [CW AW]
+render [] = []
+render (w0@(CW {_id = MID a0 b0}):w1@(CW {_id = SID a1}):w2@(CW {_id = SID b2}):ws)
+  | a0 == a1 && b0 == b2 =
+    w0 {_lemma = glue _lemma, _upos = Nothing, _xpos = glue spos}:(render ws)
+  where
+    glue f = (\a b -> a ++ "+" ++ b) <$> (f w1) <*> (f w2)
+render (w:ws) = w:(render ws)
+
+spos :: CW a -> Maybe String
+spos = fmap show . _upos
+
+main :: IO ()
+main = getArgs >>= cli
+  where
+    books = groupBy book . sortBy uid
+    cli [inputMeta, sourceRoot, targetRoot] =
+        readTsv inputMeta
+      >>= either die (mapM_ (sourceRoot `to` targetRoot) . books)
+    cli _ = syntax "METADATA_TSV_FILE SOURCE_DIR TARGET_DIR"
diff --git a/scripts/detokenizer.hs b/scripts/detokenizer.hs
new file mode 100755
index 0000000..124cbb1
--- /dev/null
+++ b/scripts/detokenizer.hs
@@ -0,0 +1,22 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall"
+{-# LANGUAGE OverloadedStrings #-}
+module Main where
+
+import Data.Text (Text, lines, unsnoc)
+import Data.Text.IO (interact)
+import Prelude hiding (interact, lines)
+
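+-- | Undo whitespace tokenization: glue "," and "." back onto the previous
+-- token and drop the space after a token ending in an apostrophe, so that
+-- ["l'", "article", ",", "enfin"] becomes "l'article, enfin".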
+detokenize :: [Text] -> Text
+detokenize [] = "\n"
+detokenize [w] = w <> "\n"
+detokenize (w:c:ws)
+  | c `elem` [",", "."] = w <> c <> " " <> detokenize ws
+detokenize (w:ws) = finish $ unsnoc w
+  where
+    finish Nothing = detokenize ws
+    finish (Just (_, c))
+      | c == '\'' = w <> detokenize ws
+      | otherwise = w <> " " <> detokenize ws
+
+main :: IO ()
+main = interact $ detokenize . lines
diff --git a/scripts/extract-corpus.sh b/scripts/extract-corpus.sh
new file mode 100755
index 0000000..4806dd1
--- /dev/null
+++ b/scripts/extract-corpus.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+INPUT_METADATA="${1}"
+SOURCE_TEXT_ARTICLES="${2}"
+OUTPUT="${3}"
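+# keep previous runs: rotate an existing OUTPUT directory to OUTPUT.N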
+if [ -d "${OUTPUT}" ]
+then
+	N=1
+	while [ -d "${OUTPUT}.${N}" ]
+	do
+		N=$((N+1))
+	done
+	mv "${OUTPUT}" "${OUTPUT}.${N}"
+fi
+
+WORKDIR=$(mktemp -d /tmp/parallel-corpus.XXX)
+
+while read LINE
+do
+	ID="${LINE%%,*}"
+	LINE="${LINE#*,}"
+	RELATIVE_PATH="${LINE%%,*}"
+	mkdir -p "${WORKDIR}/${RELATIVE_PATH%/*}"
+	cp "${SOURCE_TEXT_ARTICLES}/${RELATIVE_PATH#*/}."* "${WORKDIR}/${RELATIVE_PATH%/*}"
+	#cp "${SOURCE_TEXT_ARTICLES}/T${T}/ById/${ID}."* "${WORKDIR}/T${T}"
+done < <(tail -n +2 ${INPUT_METADATA})
+
+mv ${WORKDIR} ${OUTPUT}
diff --git a/scripts/extract-parallel-EDdA.sh b/scripts/extract-parallel-EDdA.sh
index b18305e..62dcf7c 100755
--- a/scripts/extract-parallel-EDdA.sh
+++ b/scripts/extract-parallel-EDdA.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 INPUT_METADATA="${1}"
 SOURCE_TEXT_ARTICLES="${2}"
@@ -25,7 +25,6 @@ do
 	LINE="${LINE#*,}"
 	LINE="${LINE#*,}"
 	LINE="${LINE#*,}"
-	LINE="${LINE#*,}"
 	T="${LINE%%,*}"
 	LINE="${LINE#*,}"
 	RANK="${LINE%%,*}"
diff --git a/scripts/extract-parallel-LGE.sh b/scripts/extract-parallel-LGE.sh
index b1c0c3a..eef78b6 100755
--- a/scripts/extract-parallel-LGE.sh
+++ b/scripts/extract-parallel-LGE.sh
@@ -24,8 +24,10 @@ while read LINE
 do
 	ID="${LINE%%,*}"
 	LINE="${LINE#*,}"
+	LINE="${LINE#*,}"
+	LINE="${LINE#*,}"
 	T="${LINE%%,*}"
-	cp "${SOURCE_TEXT_ARTICLES}/T${T}/ById/${ID}."* "${WORKDIR}/T${T}"
+	cp "${SOURCE_TEXT_ARTICLES}/T${T}/ById/${ID##*_}."* "${WORKDIR}/T${T}"
 done < <(tail -n +2 ${INPUT_METADATA})
 
 mv ${WORKDIR} ${OUTPUT}
diff --git a/scripts/fix-tsv.hs b/scripts/fix-tsv.hs
new file mode 100755
index 0000000..749dbbd
--- /dev/null
+++ b/scripts/fix-tsv.hs
@@ -0,0 +1,26 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE OverloadedStrings #-}
+
+import Data.ByteString as BS (toStrict)
+import Data.Csv (EncodeOptions(..), encodeWith, defaultEncodeOptions)
+import Data.Text as Text (Text, splitOn)
+import Data.Text.Encoding as Text (decodeUtf8)
+import System.Environment (getArgs)
+import System.Exit (die)
+import System.Script (syntax)
+import Text.Filter (xargs)
+
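+-- Re-encode naively tab-joined lines through cassava so that fields
+-- containing quotes, tabs or newlines get properly escaped.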
+fixTsv :: [Text] -> IO Text
+fixTsv = fmap toTsv . mapM fixLine
+  where
+    toTsv = decodeUtf8 . BS.toStrict . encodeWith
+              (defaultEncodeOptions { encDelimiter = toEnum $ fromEnum '\t' })
+    fixLine = escapeFormLemma . Text.splitOn "\t"
+    escapeFormLemma [n, form, lemma, pos, ene] = pure (n, form, lemma, pos, ene)
+    escapeFormLemma l = die $ show l
+
+main :: IO ()
+main = getArgs >>= run
+  where
+    run [target] = xargs fixTsv target
+    run _ = syntax "TARGET_DIRECTORY"
diff --git a/scripts/inject-metadata-to-lexicoscope.hs b/scripts/inject-metadata-to-lexicoscope.hs
new file mode 100755
index 0000000..b69ccbf
--- /dev/null
+++ b/scripts/inject-metadata-to-lexicoscope.hs
@@ -0,0 +1,48 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
+{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
+
+import Data.Metadata (HasDomains(..), Unique(..), readTsv)
+import Data.Metadata.Article (Article(..))
+import Data.Text as Text (Text, intercalate, isPrefixOf, pack)
+import System.Environment (getArgs)
+import System.FilePath ((</>), (<.>))
+import System.Exit (die)
+import System.Script (syntax)
+import Text.Editor (apply)
+
+to :: Foldable t => FilePath -> FilePath -> t Article -> IO ()
+to source target = mapM_ inject
+  where
+    inject a = apply (addMeta a) target (source </> uid a <.> "xml")
+
+addMeta :: Article -> [Text] -> IO [Text]
+addMeta a@(Article {tome, rank, headWord}) (firstLine:_:others) = pure $
+    firstLine
+  : "<corpus><doc><meta>"
+  : ("fileName\t" <> Text.pack (uid a) <> ".txt")
+  : ("tome\t" <> Text.pack (show tome))
+  : ("rank\t" <> Text.pack (show rank))
+  : ("head\t" <> headWord)
+  : ("domain\t" <> intercalate " | " (domains a))
+  : ("class\t" <> getClass (domains a))
+  : others
+addMeta _ _ = die "invalid input file"
+
+getClass :: [Text] -> Text
+getClass domainsList
+  | all hasGeo domainsList = "geography"
+  | any hasGeo domainsList = "some_geography"
+  | otherwise = "other"
+  where
+    hasGeo = ("Géographie" `isPrefixOf`)
+
+main :: IO ()
+main = getArgs >>= run
+  where
+    run [inputTsv, source, target] =
+      readTsv inputTsv >>= either die (source `to` target)
+    run _ = syntax "METADATA_TSV_FILE SOURCE TARGET"
diff --git a/scripts/inject-metadata.sh b/scripts/inject-metadata.sh
new file mode 100755
index 0000000..b08f651
--- /dev/null
+++ b/scripts/inject-metadata.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+METADATA="${1}"
+SOURCE="${2%/}"
+TARGET="${3%/}"
+
+# Mirror getClass from the Haskell scripts: "geography" when every domain
+# starts with Géographie, "some_geography" when only some do, "other"
+# otherwise.
+getClass()
+{
+	DOMAIN="${1}"
+	GEO=0
+	OTHER=0
+	while true
+	do
+		case "${DOMAIN%%|*}" in
+			Géographie*) GEO=1;;
+			*) OTHER=1;;
+		esac
+		if [ "${DOMAIN%%|*}" != "${DOMAIN}" ]
+		then DOMAIN="${DOMAIN#*| }"
+		else break
+		fi
+	done
+	if [ "${OTHER}" -eq 0 ]
+	then printf "geography"
+	elif [ "${GEO}" -eq 1 ]
+	then printf "some_geography"
+	else printf "other"
+	fi
+}
+
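+# For each metadata CSV row, splice a Lexicoscope <meta> block between the
+# first line of the matching XML file and the rest of it (replacing line 2).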
+while read LINE
+do
+	FILE="${LINE%%,*}"
+	LINE="${LINE#*,}"
+	TOME="${LINE%%,*}"
+	LINE="${LINE#*,}"
+	RANK="${LINE%%,*}"
+	LINE="${LINE#*,}"
+	HEAD="${LINE%%,*}"
+	LINE="${LINE#*,}"
+	DOMAIN="${LINE%%,*}"
+	CLASS="$(getClass "${DOMAIN}")"
+	INPUT="${SOURCE}/${FILE}.xml"
+	OUTPUT="${TARGET}/${FILE}.xml"
+	head -n 1 "${INPUT}" > "${OUTPUT}"
+
+	cat >> "${OUTPUT}" << EOF
+<corpus><doc><meta>
+fileName	${FILE}.txt
+tome	${TOME}
+rank	${RANK}
+head	${HEAD}
+domain	${DOMAIN}
+class	${CLASS}
+EOF
+
+	tail -n +3 "${INPUT}" >> "${OUTPUT}"
+done < <(tail -n +2 ${METADATA})
diff --git a/scripts/lge-articles-to-volumes.hs b/scripts/lge-articles-to-volumes.hs
new file mode 100755
index 0000000..7e47255
--- /dev/null
+++ b/scripts/lge-articles-to-volumes.hs
@@ -0,0 +1,153 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
+{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
+
+--import Control.Map
+import Data.Attoparsec.Text (parseOnly)
+import Data.List as List (foldl')
+import Data.Map as Map (Map, elems, empty, insertWith, toList)
+import Data.Metadata (Article(..), Tome, Tomes, byTome)
+import Data.Set as Set (Set, singleton, union)
+import Data.Text as Text (Text, concat, intercalate, splitOn, unpack)
+import Data.Text.IO as Text (readFile)
+import System.Directory (listDirectory)
+import System.Environment (getArgs, getProgName)
+import System.Exit (die)
+import System.FilePath ((</>), (<.>), dropTrailingPathSeparator, takeBaseName)
+import System.Script (syntax)
+import Text.XML.HXT.Core
+  ( (>>>), ($<), Arrow, ArrowIO, ArrowXml, IOStateArrow, XmlTree, arr, arrIO0
+  , attr, constL, getChildren, mkelem, readDocument, runX, selem, txt, unlistA
+  , withValidate, writeDocument )
+
+{-
+getTome :: FilePath -> String
+getTome = drop 1 . takeBaseName . dropTrailingPathSeparator
+-}
+
+type IOArrow b x = IOStateArrow () b x
+
+compileTome :: FilePath -> FilePath -> (Int, Tome) -> IOArrow b XmlTree
+compileTome source target (tomeNumber, metadata) =
+  tomeXml >>> writeDocument [] (target </> "T" <> strTome <.> "xml")
+  where
+    strTome = show tomeNumber
+    tomeXml = selem "/"
+                [selem "teiCorpus"
+                  [teiHeader
+                    (titleStmt strTome)
+                    (corpusPublicationStmt)
+                    (corpusSourceDesc strTome)
+                  ,(articleFrom source $< constL (Map.elems metadata))]]
+
+articleFrom :: FilePath -> Article -> IOArrow b XmlTree
+articleFrom source (Article {tome, uid, headWord, author, artflDomains, enccreDomains}) =
+  selem "TEI"
+    [teiHeader
+      (selem "titleStmt" [title $ Text.unpack headWord])
+      (publicationStmt
+        [selem "p" [txt "Annotated with Stanza by project GEODE"]])
+      (sourceDesc [selem "author" [txt $ Text.unpack author]])
+    , mkelem "text" metadata [body]]
+  where
+    path = source </> "T" <> show tome </> Text.unpack uid <.> "tei"
+    body = readDocument [withValidate False] path >>> getChildren
+    key .= value = attr key (txt $ Text.unpack value)
+    aDomainsAttr = Text.concat [":", Text.intercalate ":" artflDomains, ":"]
+    eDomainsAttr = Text.concat [":", Text.intercalate ":" enccreDomains, ":"]
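+    -- Domain attributes are colon-wrapped, e.g. ["Géographie", "Histoire"]
+    -- renders as ":Géographie:Histoire:" (presumably so queries can match a
+    -- whole ":domain:" by substring); values here are illustrative.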
+    metadata =
+      ["uid" .= uid, "author" .= author, "artflDomains" .= aDomainsAttr, "enccreDomains" .= eDomainsAttr, "book" .= "LGE"]
+
+teiHeader :: ArrowXml a =>
+  a b XmlTree -> a b XmlTree -> a b XmlTree -> a b XmlTree
+teiHeader aTitle aPublication aSource =
+  selem "teiHeader"
+    [selem "fileDesc" [aTitle, aPublication, aSource]]
+    --selem "fileDesc" [titleStmt tome, publicationStmt, sourceDesc tome]
+  --]
+
+titleStmt :: ArrowXml a => String -> a b XmlTree
+titleStmt tome = selem "titleStmt" [
+      title (unwords
+        [ "La Grande Encyclopédie, Inventaire raisonné des sciences, des lettres"
+        , "et des arts par une société de savants et de gens de lettres. Tome " ++ tome
+        ])
+    , selem "respStmt" [
+            selem "resp" [txt "Digitized by"]
+          , selem "orgName" [txt "Bibliothèque Nationale de France"]
+          , selem "resp" [txt "Encoded by"]
+          , selem "orgName" [txt "ICAR"]
+        ]
+  ]
+
+publicationStmt :: ArrowXml a => [a b XmlTree] -> a b XmlTree
+publicationStmt content = selem "publicationStmt" content
+
+corpusPublicationStmt :: ArrowXml a => a b XmlTree
+corpusPublicationStmt =
+  (publicationStmt
+    [selem "distributor"
+      [selem "orgName" [txt "Project GÉODE"]
+      ,selem "address"
+          [selem "addrline" [txt "ICAR UMR 5191"]
+          ,selem "addrline" [txt "ENS de Lyon"]]]])
+
+sourceDesc :: ArrowXml a => [a b XmlTree] -> a b XmlTree
+sourceDesc biblContent =
+  selem "sourceDesc" [selem "bibl" biblContent]
+
+corpusSourceDesc :: ArrowXml a => String -> a b XmlTree
+corpusSourceDesc tome =
+  sourceDesc
+    [title tome
+    ,selem "author" [txt "Collective"]
+    ,selem "creation"
+      [selem "date" [txt "1885"]]
+    ,selem "imprint"
+      [selem "date" [txt "1885"]
+      ,selem "publisher" [txt "H. Lamirault et Cie,"]
+      ,selem "pubplace" [txt "Paris"]]
+    ,mkelem "biblScope" [attr "unit" (txt "volume")] [txt tome]]
+
+title :: ArrowXml a => String -> a b XmlTree
+title titleContent = selem "title" [txt titleContent]
+
+main :: IO ()
+main = getArgs >>= cli
+  where
+    cli [sourceMeta, sourceRoot, targetRoot] = Text.readFile sourceMeta
+      >>= either die (mapM_ (runX . compileTome sourceRoot targetRoot) . Map.toList)
+        . parseOnly byTome
+    cli _ = syntax "METADATA_CSV_FILE SOURCE_DIR TARGET_DIR"
diff --git a/scripts/linearize.hs b/scripts/linearize.hs
index 125d393..7ad3e0e 100755
--- a/scripts/linearize.hs
+++ b/scripts/linearize.hs
@@ -1,10 +1,9 @@
-#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
 
 import Data.Char (isUpper)
 import System.Environment (getArgs)
-import System.FilePath ((</>))
 import System.Script (syntax)
-import Text.Editor (editAll)
+import Text.Filter (xargs)
 
 linearize :: String -> String
 linearize "" = ""
@@ -18,5 +17,5 @@ linearize (c:s) = c : linearize s
 main :: IO ()
 main = getArgs >>= cli
   where
-    cli [target] = getContents >>= editAll linearize target
+    cli [target] = xargs (pure . linearize) target
     cli _ = syntax "TARGET_DIR"
diff --git a/scripts/parallel-links.py b/scripts/parallel-links.py
new file mode 100755
index 0000000..284dc5d
--- /dev/null
+++ b/scripts/parallel-links.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+
+import csv
+import sys
+
+def EDdARow(columns):
+    return {
+            'tome': columns[0],
+            'article': columns[1],
+            'head': columns[2],
+            }
+
+def LGERow(tome):
+    return lambda columns: {
+            'id': columns[0],
+            'tome': tome,
+            'rank': columns[1],
+            'head': columns[2]
+        }
+
+def unserializeMetadata(path, rowReader):
+    with open(path) as f:
+        inputFile = csv.reader(f)
+        next(inputFile, None)  # skip the header row
+        for row in inputFile:
+            yield rowReader(row)
+
+def concat(generators):
+    for g in generators:
+        for x in g:
+            yield x
+
+def naiveIndexBy(field, elements):
+    # Exact-key indexing only; same as indexBy with prefix expansion disabled.
+    return indexBy(field, elements, prefix=False)
+
+def growPrefixes(d, keys, maxLength):
+    for length in range(maxLength, 1, -1):
+        newGeneration = {}
+        for key in keys:
+            if len(key) == length:
+                newKey = key[:-1]
+                if newKey in newGeneration or newKey in d:
+                    newGeneration[newKey] = None
+                else:
+                    newGeneration[newKey] = d[key]
+        for key, value in newGeneration.items():
+            if value is not None:
+                d[key] = value
+                keys.add(key)
+            elif key not in d:
+                d[key] = value
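+
+# Illustrative sketch: with heads 'ABRICOT' and 'ABSALON' indexed, every
+# shorter prefix down to 'ABR' / 'ABS' keeps pointing at its unique article,
+# while the shared prefix 'AB' ends up stored as None (ambiguous).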
+
+def indexBy(field, elements, prefix=True):
+    d = {}
+    for e in elements:
+        key = e[field]
+        if key in d:
+            d[key] = None
+        else:
+            d[key] = e
+    if prefix:
+        keys = set(d.keys())
+        growPrefixes(d, keys, max(map(len, keys)))
+    return d
+
+def headWords(head):
+    words = head.split()
+    if len(words) == 1:
+        return words
+    else:
+        return [w for w in map(lambda s: s.strip(',.'), words) if w.isupper()]
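+
+# E.g. headWords('ABSINTHE, s. f.') == ['ABSINTHE'] (hypothetical head string).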
+
+def identify(head, haystack):
+    if head in haystack:
+        if haystack[head] is not None:
+            return {'type': 'exact', 'match': head, 'found': haystack[head]}
+        else:
+            return None
+    else:
+        prefix = head[:-1]
+        while len(prefix) > 0 and prefix not in haystack:
+            prefix = prefix[:-1]
+        if prefix in haystack and haystack[prefix] is not None:
+            return {
+                    'type': 'prefix',
+                    'match': head,
+                    'found': haystack[prefix],
+                    'precision': len(prefix) / len(head)
+                    }
+        else:
+            return None
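+
+# Illustrative: if only 'ABSIN' is indexed, identify('ABSINTHE', haystack)
+# yields {'type': 'prefix', 'match': 'ABSINTHE', 'found': <the entry>,
+# 'precision': 0.625}.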
+
+def naiveGetArrows(source, target):
+    indexedSource = naiveIndexBy('head', source)
+    indexedTarget = naiveIndexBy('head', target)
+    for head, article in indexedSource.items():
+        if article is not None and head in indexedTarget and indexedTarget[head] is not None:
+            yield {
+                    'source': article,
+                    'target': indexedTarget[head]
+                    }
+
+def getArrows(source, target):
+    for article in source:
+        heads = headWords(article['head'])
+        identified = map(lambda w: identify(w, target), heads)
+        entries = [e for e in identified if e is not None]
+        if len(entries) == 1:
+            yield {
+                    'type': 'match',
+                    'source': article,
+                    'target': entries[0]
+                    }
+        elif len(entries) > 1:
+            yield {
+                    'type': 'ambiguity',
+                    'source': article,
+                    'target': entries
+                    }
+
+def interesting(arrow):
+    if arrow['type'] == 'match':
+        target = arrow['target']
+        return len(target['match']) > 3 and (target['type'] == 'exact'
+                                             or target['precision'] > 0.8)
+    return False
+
+def getMetadata(arrows, path=None):
+    output = sys.stdout if path is None else open(path, 'w')
+    try:
+        toCsv = csv.writer(output, lineterminator='\n')
+        toCsv.writerow(
+                ['head', 'id', 'tomeEDdA', 'rankEDdA', 'tomeLGE', 'rankLGE']
+                )
+        for arrow in arrows:
+            toCsv.writerow([
+                arrow['target']['head'],
+                arrow['target']['id'],
+                arrow['source']['tome'],
+                arrow['source']['article'],
+                arrow['target']['tome'],
+                arrow['target']['rank']
+                ])
+    finally:
+        if output is not sys.stdout:
+            output.close()
+
+if __name__ == '__main__':
+    edda = list(unserializeMetadata(sys.argv[1], EDdARow))
+    lge = list(concat([
+        unserializeMetadata(f'{sys.argv[2]}/T{T}/metadata.csv', LGERow(T)) for T in range(1, 32)
+    ]))
+    getMetadata(list(naiveGetArrows(edda, lge)), sys.argv[3])
diff --git a/scripts/parallel-metadata.hs b/scripts/parallel-metadata.hs
new file mode 100644
index 0000000..eb647e6
--- /dev/null
+++ b/scripts/parallel-metadata.hs
@@ -0,0 +1,15 @@
+-- Work-in-progress sketch: join the EDdA metadata with the EDdA/LGE link
+-- table produced by parallel-links.py (parallel-metadata.py is the pandas
+-- implementation currently in use). Only the column layouts are settled;
+-- the decoding and joining logic below is still commented out.
+
+type Link = (String, String, Int, Int, Int, Int)
+--(headWord, uid, tomeEDdA, rankEDdA, tomeLGE, rankLGE)
+
+--type SourceMeta = (Int, Int, String, Authors, Domains, Domains)
+--(T,article,head,author,ARTFL_domain,ENCCRE_domain)
+
+--join :: yet to be designed
+--lgeMeta :: Link -> Article
+
+--Right edda <- (decode HasHeader <$> B.readFile "../EDdA/metadata.csv"
+--  :: IO (Either String (Data.Vector.Vector Link)))
+
+main :: IO ()
+main = pure ()
diff --git a/scripts/parallel-metadata.py b/scripts/parallel-metadata.py
new file mode 100755
index 0000000..6f516a3
--- /dev/null
+++ b/scripts/parallel-metadata.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+
+import pandas
+from sys import argv
+
+def edda_parallel_line(row):
+    return ('edda_' + str(row.tomeEDdA) + '_' + str(row.article),
+            'EDdA/T' + str(row.tomeEDdA) + '/article' + str(row.article),
+            'EDdA',
+            row.tomeEDdA,
+            row.article,
+            row['head'],
+            row.author,
+            row.ENCCRE_domain)
+
+def lge_parallel_line(row):
+    return ('lge_' + str(row.tomeLGE) + '_' + str(row.id),
+            'LGE/T' + str(row.tomeLGE) + '/' + str(row.id),
+            'LGE',
+            row.tomeLGE,
+            row.rankLGE,
+            row['head'],
+            '',
+            row.ENCCRE_domain)
+
+def merge(edda_meta, edda_lge_mapping):
+    return pandas.merge(
+            edda_meta,
+            edda_lge_mapping,
+            how='inner',
+            on='head')
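+
+# Expected shapes (sketch): edda_meta has one row per EDdA article ('head',
+# 'tomeEDdA', 'article', 'author', 'ENCCRE_domain', ...) while
+# edda_lge_mapping, produced by parallel-links.py, has one row per link
+# ('head', 'id', 'tomeEDdA', 'rankEDdA', 'tomeLGE', 'rankLGE'); the inner
+# join on 'head' keeps only articles found in both books.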
+
+def output_metadata(input_meta, row_mapper):
+    output = input_meta.apply(row_mapper, result_type='expand', axis=1)
+    output.columns = ['uid', 'path', 'book', 'tome', 'rank', 'head', 'authors', 'domains']
+    return output
+
+if __name__ == '__main__':
+    if len(argv) == 4:
+        parallel_meta = merge(pandas.read_csv(argv[1]), pandas.read_csv(argv[2]))
+        edda_output = f"{argv[3]}/edda_parallel_metadata.csv"
+        lge_output = f"{argv[3]}/lge_parallel_metadata.csv"
+        output_metadata(parallel_meta, edda_parallel_line).to_csv(edda_output, index=False)
+        output_metadata(parallel_meta, lge_parallel_line).to_csv(lge_output, index=False)
+    else:
+        print(f"Syntax: {argv[0]} EDdA_metadata EDdA_LGE_mapping OUTPUT_DIRECTORY")
diff --git a/scripts/reroot.hs b/scripts/reroot.hs
index fc5e715..1b67fc3 100755
--- a/scripts/reroot.hs
+++ b/scripts/reroot.hs
@@ -1,10 +1,12 @@
-#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
 {-# LANGUAGE OverloadedStrings #-}
 
-import Data.Text (Text)
+import Data.Text (Text, lines, unlines)
+import Data.Text.IO (interact)
+import Prelude hiding (interact, lines, unlines)
 import System.Environment (getArgs)
 import System.Script (syntax)
-import Text.Editor (editAll)
+import Text.Filter (xargs)
 
 wrap :: [Text] -> [Text]
 wrap text = "<body>": (("\t"<>) <$> text) ++ ["</body>"]
@@ -12,5 +14,6 @@ wrap text = "<body>": (("\t"<>) <$> text) ++ ["</body>"]
 main :: IO ()
 main = getArgs >>= run
   where
-    run [target] = getContents >>= editAll wrap target
+    run ["-"] = interact (unlines . wrap . lines)
+    run [target] = xargs (pure . wrap) target
     run _ = syntax "TARGET_DIRECTORY"
diff --git a/scripts/spacy-to-txm.hs b/scripts/spacy-to-txm.hs
new file mode 100755
index 0000000..1d8e05a
--- /dev/null
+++ b/scripts/spacy-to-txm.hs
@@ -0,0 +1,65 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE NamedFieldPuns #-}
+module Main where
+
+import Data.BIO (Tree(..), bio)
+import Data.Metadata
+  ( HasAuthors(..), Book(..), groupBy, readTsv, relativePath )
+import Data.Metadata.Paragraph (Paragraph(..))
+import Data.SpaCy (Annotation(..))
+import Data.Text as Text (unpack)
+import System.Environment (getArgs)
+import System.Exit (die)
+import System.FilePath ((</>), (<.>))
+import System.Script (syntax)
+import Text.TEI
+  ( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title )
+import Text.XML.HXT.Core
+  ( (>>>), (|||), ($<), IOSArrow, XmlTree, arr, arrIO, arrIO0, attr, constL
+  , mkelem, mkText, runX, selem, txt, withIndent, writeDocument )
+
+to :: FilePath -> FilePath -> (Int, [Paragraph]) -> IO ()
+to source target = fmap (\_ -> ()) . runX . compileTome source target
+
+compileTome :: FilePath -> FilePath -> (Int, [Paragraph]) -> IOSArrow b XmlTree
+compileTome source target (tomeNumber, metadata) =
+  tomeXml >>> writeDocument [withIndent True] output
+  where
+    strTome = show tomeNumber
+    output = target </> "T" <> strTome <.> "xml"
+    tomeXml = selem "/"
+                [ selem "teiCorpus"
+                  [ corpusHeader EDdA strTome
+                  , (paragraphFrom source $< constL metadata) ] ]
+
+paragraphFrom :: FilePath -> Paragraph -> IOSArrow b XmlTree
+paragraphFrom source paragraph@(Paragraph {headWord, paragraphId}) =
+  selem "TEI"
+    [ teiHeader
+        (selem "titleStmt" [title $ headWord <> " §" <> show paragraphId])
+        (publicationStmt
+          [ selem "p" [ txt "Annotated with SpaCy by project GEODE" ] ])
+        (sourceDesc [ authorArrow >>> selem "author" [ mkText ] ])
+    , text "paragraph" paragraph (loadTsv input) ]
+  where
+    input = source </> relativePath paragraph <.> "tsv"
+    authorArrow = constL (Text.unpack <$> authors paragraph)
+
+loadTsv :: FilePath -> IOSArrow b XmlTree
+loadTsv input = arrIO0 (readTsv input) >>> ((arrIO die) ||| format)
+  where
+    format = toXml $< arr (bio namedEntity)
+    toXml (Root nodes) = selem "body" [ toXml $< constL nodes ]
+    toXml (Leaf (Annotation {lemma, pos, form})) =
+      mkelem "w" [ attr "lemma" (txt lemma)
+                 , attr "pos" (txt pos) ]
+                 [ txt form ]
+    toXml (Node namedEntity nodes) =
+      mkelem "rs" [ attr "type" (txt namedEntity) ] [ toXml $< constL nodes ]
+
+main :: IO ()
+main = getArgs >>= cli
+  where
+    cli [datasetPath, source, target] = readTsv datasetPath
+      >>= either die (mapM_ (source `to` target) . groupBy tome)
+    cli _ = syntax "DATASET_TSV SOURCE_DIR TARGET_DIR"
diff --git a/scripts/stanza-annotator.py b/scripts/stanza-annotator.py
deleted file mode 100755
index c6f4d5e..0000000
--- a/scripts/stanza-annotator.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import os.path
-import stanza
-import sys
-from xml.sax.saxutils import escape, quoteattr
-
-class Annotator:
-    def __init__(self, source, target):
-        self.source = source
-        self.target = target
-        self.model = stanza.Pipeline(lang='fr', processors='tokenize,pos,lemma')
-
-    def run(self):
-        for name in os.listdir(self.source):
-            if os.path.isdir(f'{self.source}/{name}'):
-                self.annotate_dir(name)
-
-    def annotate_dir(self, directory):
-        source_path = f'{self.source}/{directory}'
-        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
-        for name in os.listdir(source_path):
-            path = f'{source_path}/{name}'
-            relpath = f'{directory}/{name}'
-            if os.path.isdir(path):
-                self.annotate_dir(relpath)
-            elif os.path.isfile(path):
-                self.annotate_file(relpath)
-
-    def annotate_file(self, file):
-        with open(f'{self.source}/{file}', 'r') as source:
-            self.encode(f'{file[:-4]}.tei', self.model(source.read()))
-
-    def encode(self, file, annotation):
-        with open(f'{self.target}/{file}', 'w') as target:
-            print('<body>', file=target)
-            for sentence in annotation.sentences:
-                self.encode_sentence(sentence, target)
-            print('</body>', file=target)
-
-    def encode_sentence(self, sentence, target):
-        print('\t<s>', file=target)
-        for token in sentence.tokens:
-            self.encode_token(token, target)
-        print('\t</s>', file=target)
-
-    def encode_token(self, token, target):
-        form = escape(token.text)
-        lemma = quoteattr('+'.join(map(lambda w: w.lemma, token.words)))
-        upos = quoteattr('+'.join(map(lambda w: w.upos, token.words)))
-        print(f'\t\t<w lemma={lemma} pos={upos}>{form}</w>', file=target)
-
-if __name__ == '__main__':
-    Annotator(sys.argv[1], sys.argv[2]).run()
diff --git a/scripts/stanza-txt-annotator.py b/scripts/stanza-txt-annotator.py
new file mode 100755
index 0000000..5ebcbf0
--- /dev/null
+++ b/scripts/stanza-txt-annotator.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+
+import os
+import os.path
+import stanza
+from stanza.utils.conll import CoNLL
+import sys
+
+def oneLine(sentence):
+    return sentence.text.replace('\n', ' ').replace('\t', ' ')
+
+class Annotator:
+    def __init__(self, source, target):
+        self.source = source
+        self.target = target
+        self.paragraphSerial = -1
+        self.sentenceSerial = -1
+        self.model = stanza.Pipeline(
+                lang='fr',
+                processors='tokenize,pos,lemma,depparse')
+
+    def run(self):
+        for name in os.listdir(self.source):
+            if os.path.isdir(f'{self.source}/{name}'):
+                self.annotate_dir(name)
+
+    def annotate_dir(self, directory):
+        source_path = f'{self.source}/{directory}'
+        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
+        for name in os.listdir(source_path):
+            path = f'{source_path}/{name}'
+            relpath = f'{directory}/{name}'
+            if os.path.isdir(path):
+                self.annotate_dir(relpath)
+            elif os.path.isfile(path):
+                self.annotate_file(relpath)
+
+    def annotate_file(self, file):
+        outputFile = f'{self.target}/{file[:-4]}.conllu'
+        if not os.path.isfile(outputFile):
+            with open(f'{self.source}/{file}', 'r') as source:
+                document = source.read()
+                print("trying " + outputFile)
+                try:
+                    with open(outputFile, 'w') as target:
+                        for paragraph in document.split('\n\n'):
+    #                        print(f'# newpar id = {self.newpar()}', file=target)
+                            self.annotate_paragraph(paragraph, target)
+                except Exception:
+                    os.unlink(outputFile)
+                    sys.exit(1)
+                    
+
+    def annotate_paragraph(self, paragraph, target):
+        parsed = self.model(paragraph)
+        if len(parsed.sentences) > 0:
+            print(f'# newpar id = {self.newpar()}', file=target)
+            for sentence in parsed.sentences:
+                sentence.add_comment(f'# sent_id = {self.newsent()}')
+                sentence.add_comment(f'# text = {oneLine(sentence)}')
+            print(CoNLL.doc2conll_text(parsed), file=target, end='')
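+
+    # Sketch of the emitted CoNLL-U fragment (columns abridged, values illustrative):
+    #   # newpar id = 0
+    #   # sent_id = 0
+    #   # text = Première phrase du paragraphe.
+    #   1   Première   premier   ADJ   ...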
+
+    def newpar(self):
+        self.paragraphSerial += 1
+        return self.paragraphSerial
+
+    def newsent(self):
+        self.sentenceSerial += 1
+        return self.sentenceSerial
+
+if __name__ == '__main__':
+    Annotator(sys.argv[1], sys.argv[2]).run()
diff --git a/scripts/stanza-xml-annotator.py b/scripts/stanza-xml-annotator.py
new file mode 100755
index 0000000..b8ee15e
--- /dev/null
+++ b/scripts/stanza-xml-annotator.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+from lxml import etree
+import os
+import os.path
+import stanza
+from stanza.utils.conll import CoNLL
+import sys
+
+def oneLine(sentence):
+    return sentence.text.replace('\n', ' ').replace('\t', ' ')
+
+class Annotator:
+    def __init__(self, source, target, xpath):
+        self.source = source
+        self.target = target
+        self.xpath = xpath
+        self.paragraphSerial = -1
+        self.sentenceSerial = -1
+        self.model = stanza.Pipeline(
+                lang='fr',
+                processors='tokenize,pos,lemma,depparse')
+
+    def run(self):
+        for name in os.listdir(self.source):
+            if os.path.isdir(f'{self.source}/{name}'):
+                self.annotate_dir(name)
+
+    def annotate_dir(self, directory):
+        source_path = f'{self.source}/{directory}'
+        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
+        for name in os.listdir(source_path):
+            path = f'{source_path}/{name}'
+            relpath = f'{directory}/{name}'
+            if os.path.isdir(path):
+                self.annotate_dir(relpath)
+            elif os.path.isfile(path):
+                self.annotate_file(relpath)
+
+    def annotate_file(self, file):
+        outputFile = f'{self.target}/{file[:-4]}.conllu'
+        if not os.path.isfile(outputFile):
+            document = etree.parse(f'{self.source}/{file}')
+            with open(outputFile, 'w') as target:
+                for paragraph in document.xpath(self.xpath):
+                    self.annotate_paragraph(paragraph, target)
+
+    def annotate_paragraph(self, paragraph, target):
+        parsed = self.model(" ".join(paragraph.itertext()))
+        if len(parsed.sentences) > 0:
+            print(f'# newpar id = {self.newpar()}', file=target)
+            for sentence in parsed.sentences:
+                sentence.add_comment(f'# sent_id = {self.newsent()}')
+                sentence.add_comment(f'# text = {oneLine(sentence)}')
+            print(CoNLL.doc2conll_text(parsed), file=target, end='')
+
+    def newpar(self):
+        self.paragraphSerial += 1
+        return self.paragraphSerial
+
+    def newsent(self):
+        self.sentenceSerial += 1
+        return self.sentenceSerial
+
+if __name__ == '__main__':
+    paragraphXPath = '/TEI/text/body/div1/p'
+    if len(sys.argv) > 3:
+        paragraphXPath = sys.argv[3]
+    Annotator(sys.argv[1], sys.argv[2], paragraphXPath).run()
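+
+# Usage sketch (illustrative paths):
+#   ./stanza-xml-annotator.py corpus/tei/ corpus/conllu/ '/TEI/text/body/div1/p'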
diff --git a/scripts/xml-annotator.py.bak b/scripts/xml-annotator.py.bak
new file mode 100755
index 0000000..5ac322a
--- /dev/null
+++ b/scripts/xml-annotator.py.bak
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+
+from lxml import etree
+import os
+import os.path
+import stanza
+from stanza.utils.conll import CoNLL
+import sys
+
+def oneLine(sentence):
+    return sentence.text.replace('\n', ' ')
+
+class Annotator:
+    def __init__(self, source, target, xpath):
+        self.source = source
+        self.target = target
+        #self.parser = etree.HTMLParser()
+        self.xpath = xpath
+        self.paragraphSerial = -1
+        self.sentenceSerial = -1
+        self.model = stanza.Pipeline(
+                lang='fr',
+                processors='tokenize,pos,lemma,depparse')
+
+    def run(self):
+        for name in os.listdir(self.source):
+            if os.path.isdir(f'{self.source}/{name}'):
+                self.annotate_dir(name)
+
+    def annotate_dir(self, directory):
+        source_path = f'{self.source}/{directory}'
+        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
+        for name in os.listdir(source_path):
+            path = f'{source_path}/{name}'
+            relpath = f'{directory}/{name}'
+            if os.path.isdir(path):
+                self.annotate_dir(relpath)
+            elif os.path.isfile(path):
+                self.annotate_file(relpath)
+
+    def annotate_file(self, file):
+        outputFile = f'{self.target}/{file[:-4]}.conllu'
+        if not os.path.isfile(outputFile):
+            document = etree.parse(f'{self.source}/{file}')
+            #document = etree.parse(f'{self.source}/{file}', self.parser)
+            with open(outputFile, 'w') as target:
+                for paragraph in document.xpath(self.xpath):
+                    print(f'# newpar id = {self.newpar()}', file=target)
+                    self.annotate_paragraph(paragraph, target)
+
+    def annotate_paragraph(self, paragraph, target):
+        parsed = self.model(" ".join(paragraph.itertext()))
+        for sentence in parsed.sentences:
+            sentence.add_comment(f'# sent_id = {self.newsent()}')
+            sentence.add_comment(f'# text = {oneLine(sentence)}')
+        print(CoNLL.doc2conll_text(parsed), file=target)
+
+    def newpar(self):
+        self.paragraphSerial += 1
+        return self.paragraphSerial
+
+    def newsent(self):
+        self.sentenceSerial += 1
+        return self.sentenceSerial
+
+if __name__ == '__main__':
+    paragraphXPath = '/TEI/text/body/div1/p'
+    if len(sys.argv) > 3:
+        paragraphXPath = sys.argv[3]
+    Annotator(sys.argv[1], sys.argv[2], paragraphXPath).run()
diff --git a/scripts/xml-annotator.py.remote b/scripts/xml-annotator.py.remote
new file mode 100755
index 0000000..1db6bf2
--- /dev/null
+++ b/scripts/xml-annotator.py.remote
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+
+from lxml import etree
+import os
+import os.path
+import stanza
+from stanza.utils.conll import CoNLL
+import sys
+
+def oneLine(sentence):
+    return sentence.text.replace('\n', ' ')
+
+class Annotator:
+    def __init__(self, source, target, xpath):
+        self.source = source
+        self.target = target
+        self.xpath = xpath
+        self.paragraphSerial = -1
+        self.sentenceSerial = -1
+        self.model = stanza.Pipeline(
+                lang='fr',
+                processors='tokenize,pos,lemma,depparse')
+
+    def run(self):
+        for name in os.listdir(self.source):
+            if os.path.isdir(f'{self.source}/{name}'):
+                self.annotate_dir(name)
+
+    def annotate_dir(self, directory):
+        source_path = f'{self.source}/{directory}'
+        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
+        for name in os.listdir(source_path):
+            path = f'{source_path}/{name}'
+            relpath = f'{directory}/{name}'
+            if os.path.isdir(path):
+                self.annotate_dir(relpath)
+            elif os.path.isfile(path):
+                self.annotate_file(relpath)
+
+    def annotate_file(self, file):
+        with open(f'{self.source}/{file}', 'r') as source:
+            document = etree.parse(source)
+            with open(f'{self.target}/{file[:-4]}.conllu', 'w') as target:
+                for paragraph in document.findall(self.xpath):
+                    print(f'# newpar id = {self.newpar()}', file=target)
+                    self.annotate_paragraph(paragraph, target)
+
+    def annotate_paragraph(self, paragraph, target):
+        parsed = self.model(" ".join(paragraph.itertext()))
+        for sentence in parsed.sentences:
+            sentence.add_comment(f'# sent_id = {self.newsent()}')
+            sentence.add_comment(f'# text = {oneLine(sentence)}')
+        print(CoNLL.doc2conll_text(parsed), file=target)
+
+    def newpar(self):
+        self.paragraphSerial += 1
+        return self.paragraphSerial
+
+    def newsent(self):
+        self.sentenceSerial += 1
+        return self.sentenceSerial
+
+if __name__ == '__main__':
+    paragraphXPath = 'text/body/div1/p'
+    if len(sys.argv) > 3:
+        paragraphXPath = sys.argv[3]
+    Annotator(sys.argv[1], sys.argv[2], paragraphXPath).run()
-- 
GitLab