From 1e28d5936765d96f668daacef3f2c7e84f408001 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Fri, 16 Jun 2023 15:39:19 +0200
Subject: [PATCH] Save everything that can be saved after disk crash

---
 lib/Data/Metadata.hs                          |  38 +++-
 lib/Data/Metadata/Article.hs                  |   1 +
 lib/Data/Metadata/Trie.hs                     |  17 +-
 lib/System/Script.hs                          |  14 +-
 lib/Text/TEI.hs                               |  16 +-
 manifest.scm                                  |   4 +
 scripts/EDdA/extract-from-source.sh           |  25 +++
 scripts/conllu-to-lexicoscope.hs              |  45 ++---
 scripts/conllu-to-txm.hs                      |  54 +++---
 scripts/extract-corpus.sh                     |   2 +-
 scripts/extract-first-word.hs                 | 175 ++++++++++++++++++
 scripts/extract-first-word_lge-mode-backup.hs |  76 ++++++++
 scripts/extract-from-source.sh                |  20 ++
 scripts/lib.sh                                |   4 +
 scripts/merge.hs                              |  28 +++
 scripts/parallel-links.hs                     |  36 ++++
 scripts/parallel-links.py                     | 170 ++++++++---------
 scripts/select.hs                             |  27 +++
 scripts/subcorpus/getFiles.hs                 |  29 +++
 scripts/subcorpus/getTXMQuery.hs              |  21 +++
 20 files changed, 648 insertions(+), 154 deletions(-)
 create mode 100755 scripts/EDdA/extract-from-source.sh
 create mode 100644 scripts/extract-first-word.hs
 create mode 100644 scripts/extract-first-word_lge-mode-backup.hs
 create mode 100755 scripts/extract-from-source.sh
 create mode 100644 scripts/lib.sh
 create mode 100755 scripts/merge.hs
 create mode 100755 scripts/parallel-links.hs
 create mode 100755 scripts/select.hs
 create mode 100755 scripts/subcorpus/getFiles.hs
 create mode 100755 scripts/subcorpus/getTXMQuery.hs

diff --git a/lib/Data/Metadata.hs b/lib/Data/Metadata.hs
index 8af810b..d8ef60a 100644
--- a/lib/Data/Metadata.hs
+++ b/lib/Data/Metadata.hs
@@ -13,13 +13,16 @@ module Data.Metadata (
     , list
     , readTsv
     , sortBy
+    , tsvFile
+    , tsvLines
   ) where
 
-import Data.ByteString.Lazy as ByteString (readFile, writeFile)
+import Data.ByteString.Lazy as ByteString (putStr, readFile, writeFile)
 import Data.ByteString.Char8 as StrictByteString (pack)
 import Data.Csv
-  ( DecodeOptions(..), EncodeOptions(..), FromRecord, HasHeader(..), ToRecord
-  , decodeWith, defaultEncodeOptions, encodeWith, header, namedField )
+  ( DecodeOptions(..), EncodeOptions(..), FromRecord, HasHeader(..)
+  , ToNamedRecord, ToRecord, decodeWith, defaultEncodeOptions, encodeByNameWith
+  , encodeWith, header )
 import Data.Foldable as Foldable (toList)
 import Data.List (sortOn)
 import Data.Map.Strict as Map (alter, empty, toList)
@@ -33,10 +36,33 @@ list :: [Text] -> String
 list ts = Text.unpack $ ":" <> intercalate ":" ts <> ":"
 
 readTsv :: FromRecord a => FilePath -> IO (Either String (Vector a))
-readTsv source = decodeWith tsv HasHeader <$> ByteString.readFile source
+readTsv source = decodeWith fromTsv HasHeader <$> ByteString.readFile source
   where
-    tsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')}
+    fromTsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')}
 
+{-
+toTsv :: (Foldable t, ToNamedRecord a) => Maybe [String] -> t a -> ByteString
+toTsv fields content =
+  encodeByNameWith tsv (header bsFields)
+  $ zipWith namedField bsFields (Foldable.toList content)
+  where
+    bsFields = StrictByteString.pack <$> fields
+-}
+
+toTsv :: EncodeOptions
+toTsv = defaultEncodeOptions
+        { encDelimiter = fromIntegral (fromEnum '\t')
+        , encUseCrLf = False }
+
+tsvFile :: ToNamedRecord a => FilePath -> [String] -> [a] -> IO ()
+tsvFile target fields =
+  ByteString.writeFile target
+  . encodeByNameWith toTsv (header $ StrictByteString.pack <$> fields)
+
+tsvLines :: ToRecord a => [a] -> IO ()
+tsvLines = ByteString.putStr . encodeWith toTsv
+
+{-
 writeTsv :: (Foldable t, ToRecord a) => [String] -> t a -> FilePath -> IO ()
 writeTsv fields content target =
   ByteString.writeFile target
@@ -47,6 +73,8 @@ writeTsv fields content target =
     tsv = defaultEncodeOptions
             { encDelimiter = fromIntegral (fromEnum '\t')
             , encUseCrLf = False }
+-}
+
 
 sortBy :: (Foldable t, Ord k) => (a -> k) -> t a -> [a]
 sortBy field = sortOn field . Foldable.toList
diff --git a/lib/Data/Metadata/Article.hs b/lib/Data/Metadata/Article.hs
index ba8c694..505bc10 100644
--- a/lib/Data/Metadata/Article.hs
+++ b/lib/Data/Metadata/Article.hs
@@ -22,6 +22,7 @@ data Article = Article
 
 instance FromRecord Article
 instance ToRecord Article
+--instance ToNamedRecord Article
 
 instance Unique Article where
   uid (Article {articleBook, tome, name}) =
diff --git a/lib/Data/Metadata/Trie.hs b/lib/Data/Metadata/Trie.hs
index 2383ad5..3ad51a6 100644
--- a/lib/Data/Metadata/Trie.hs
+++ b/lib/Data/Metadata/Trie.hs
@@ -4,7 +4,7 @@ module Data.Metadata.Trie
 
 import Data.Foldable (toList)
 import Data.Map (Map)
-import qualified Data.Map as Map (empty, insert, lookup, singleton)
+import qualified Data.Map as Map (delete, empty, insert, lookup, singleton)
 
 data Trie e n =
   Trie
@@ -43,12 +43,19 @@ at trie = at_ trie . toList
     at_ (Trie {store}) [] = store
     at_ (Trie {edges}) (e:es) = Map.lookup e edges >>= (`at_` es)
 
-insert :: Foldable t => t e -> n -> Trie e n -> Trie e n
-insert path n trie = zipUp (edit (trie, Top))
+insert :: (Ord e, Foldable t) => t e -> n -> Trie e n -> Trie e n
+insert path n trie = zipUp (edit (toList path) (trie, Top))
   where
-    edit 
+    edit [] (trie, zipper) = (trie {store = Just n}, zipper)
+    edit (e:es) (Trie {store, edges}, above) =
+      let (subTree, otherEdges) = partition e edges in
+      edit es (subTree, Zip {atValue = store, otherEdges, byEdge = e, above})
+    partition e edges =
+      case Map.lookup e edges of
+        Just subTree -> (subTree, Map.delete e edges)
+        _ -> (empty, Map.empty)
 
 type Indexed e n = ([e], n)
 
-index :: (Foldable t, Functor t) => (a -> Indexed e n) -> t a -> Trie e n
+index :: (Ord e, Foldable t, Functor t) => (a -> Indexed e n) -> t a -> Trie e n
 index by = foldr (uncurry insert) empty . fmap by
diff --git a/lib/System/Script.hs b/lib/System/Script.hs
index 2455009..5f54942 100644
--- a/lib/System/Script.hs
+++ b/lib/System/Script.hs
@@ -1,12 +1,20 @@
-module System.Script (
-    syntax
-  ) where
+module System.Script
+  ( syntax
+  , try
+  , warn ) where
 
 import System.Exit (die)
 import System.Environment (getProgName)
+import System.IO (hPutStrLn, stderr)
 import Text.Printf (printf)
 
 syntax :: String -> IO ()
 syntax s = do
   this <- getProgName
   die $ printf "Syntax: %s %s" this s
+
+try :: IO (Either String a) -> IO a
+try = (>>= either die pure)
+
+warn :: String -> IO ()
+warn = hPutStrLn stderr
diff --git a/lib/Text/TEI.hs b/lib/Text/TEI.hs
index 7514df9..ba66a30 100644
--- a/lib/Text/TEI.hs
+++ b/lib/Text/TEI.hs
@@ -7,10 +7,10 @@ module Text.TEI
   , text
   , title ) where
 
-import Data.Metadata
-  ( Book(..), FromBook(..), HasAuthors(..), HasDomains(..), TXMText, Unique(..)
-  , list )
-import Text.XML.HXT.Core ( ArrowXml, XmlTree, aelem, attr, mkelem, selem, txt )
+import Data.Text (unpack)
+import GEODE.Metadata
+  (Book(..), Contrastive(..), Has(..), PrimaryKey(..), formatList, uid)
+import Text.XML.HXT.Core (ArrowXml, XmlTree, aelem, attr, mkelem, selem, txt)
 
 corpusHeader :: ArrowXml a => Book -> String -> a b XmlTree
 corpusHeader EDdA = eddaHeader
@@ -85,15 +85,17 @@ by :: ArrowXml a => String -> String -> [a b XmlTree]
 by resp orgName = [ selem "resp" [ txt (resp <> " by") ]
                   , selem "orgName" [ txt orgName ] ]
 
+type TXMText a = (Has Contrastive a, Has PrimaryKey a)
+
 text :: (ArrowXml a, TXMText b) => String -> b -> a c XmlTree -> a c XmlTree
 text unitName txmText body =
   mkelem "text" metadata [ body, aelem "milestone" [ "unit" .= unitName ] ]
   where
     metadata =
       [ "uid" .= uid txmText
-      , "book" .= show (book txmText)
-      , "author" .= list (authors txmText)
-      , "domains" .= list (domains txmText) ]
+      , "book" .= (show . book $ get txmText)
+      , "author" .= (unpack . formatList . authors $ get txmText)
+      , "domains" .= (unpack . formatList . domains $ get txmText) ]
 
 (.=) :: ArrowXml a => String -> String -> a b XmlTree
 key .= value = attr key (txt value)
diff --git a/manifest.scm b/manifest.scm
index af89124..6c52a47 100644
--- a/manifest.scm
+++ b/manifest.scm
@@ -10,13 +10,17 @@
              ((gnu packages xml) #:select (python-lxml)))
 
 (define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm"))
+(define edda-clinic (load "/home/alice/Logiciel/EDdAClinic/guix.scm"))
+(define ghc-geode (load "/home/alice/Logiciel/ghc-geode/guix.scm"))
 
 (packages->manifest
   (list
     coreutils ; mktemp for atomic processing, strip CSV headers, general scripting
+    edda-clinic ; fix and cut the EDdA
     gcc-toolchain ; running haskell
     ghc ; running haskell
     ghc-cassava ; working with CSV in haskell
+    ghc-geode ; handling corpus files
     ghc-hs-conllu ; working on syntax-annotated documents
     ghc-hxt ; working on xml documents
     python ; scripts
diff --git a/scripts/EDdA/extract-from-source.sh b/scripts/EDdA/extract-from-source.sh
new file mode 100755
index 0000000..d3bd4f6
--- /dev/null
+++ b/scripts/EDdA/extract-from-source.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+source ${0%/*}/../lib.sh
+
+if [ "$#" != 2 ]
+then
+	die "${0##*/} SOURCE_DIRECTORY TARGET_DIRECTORY"
+else
+	SOURCE="${1}"
+	TARGET="${2}"
+	[ -d "${SOURCE}" ] || die "SOURCE_DIRECTORY must be a directory (containing 1 .tei file per tome)"
+	[ -d "${TARGET}" ] || die "TARGET_DIRECTORY must be a directory (where output will be generated)"
+fi
+
+for T in {1..17}
+do
+	cleanEDdA < "${SOURCE}/volume$(printf "%02d" $T).tei" \
+		| fixEDdA \
+		| splitArticles \
+				--tome ${T} \
+				--xpath "/TEI/text/body/div1" \
+				--xml-root "${TARGET}/TEI" \
+				--text-root "${TARGET}/Text" \
+				--metadata
+done
diff --git a/scripts/conllu-to-lexicoscope.hs b/scripts/conllu-to-lexicoscope.hs
index 705f39a..e4bdfea 100755
--- a/scripts/conllu-to-lexicoscope.hs
+++ b/scripts/conllu-to-lexicoscope.hs
@@ -1,5 +1,5 @@
 #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
-{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
+{-# LANGUAGE ExplicitNamespaces, NamedFieldPuns, OverloadedStrings #-}
 
 import Conllu.Parse (parseConllu)
 import Conllu.Parse.Paragraph (Paragraph(..), byParagraph)
@@ -8,8 +8,8 @@ import Conllu.Type (Doc, Sent(..))
 import Control.Monad.IO.Class (MonadIO(..))
 import Control.Monad.State (StateT(..), evalStateT)
 import Data.List as List (intercalate)
-import Data.Map (toList)
-import Data.Metadata (Article(..), Authors(..), Domains(..), Tome, byTome, list)
+--import Data.Map (toList)
+import GEODE.Metadata (Contrastive(..), Entry(..), Has(..), PrimaryKey(..), type(@), formatList, groupBy, readNamedTsv, relativePath, sortBy, uid)
 import Data.Text as Text (unpack)
 import System.Environment (getArgs)
 import System.Exit (die)
@@ -21,6 +21,8 @@ import Text.XML.HXT.Core
 import Text.XML.HXT.Arrow.XmlState.TypeDefs (Selector(..), chgS, theUserState)
 import Text.XML.HXT.Arrow.XmlState.RunIOStateArrow (initialState)
 
+type Article = PrimaryKey @ Entry @ Contrastive
+
 type Serial = (Int, Int)
 
 next :: Selector Serial Int -> IOStateArrow Serial a String
@@ -35,19 +37,18 @@ pId = S {getS = fst, setS = \p (_, s) -> (p, s)}
 sId :: Selector Serial Int
 sId = S {getS = snd, setS = \s (p, _) -> (p, s)}
 
-to :: FilePath -> FilePath -> (Int, Tome) -> StateT Serial IO ()
-to source target (tomeNumber, tome) =
-  mapM_ (processArticle (source </> tomeDir) (target </> tomeDir)) tome
-  where
-    tomeDir = "T" <> show tomeNumber
+to :: FilePath -> FilePath -> (Int, [Article]) -> StateT Serial IO ()
+to source target (_, tome) =
+  mapM_ (processArticle source target) tome
 
 processArticle :: FilePath -> FilePath -> Article -> StateT Serial IO ()
-processArticle source target a@(Article {uid, rank}) =
+processArticle source target article =
+--processArticle source target a@(Article {rank}) =
   liftIO (putStrLn ("processing " <> input) *> Prelude.readFile input)
-    >>= either (liftIO . die) (format a output) . parseConllu input
+    >>= either (liftIO . die) (format article output) . parseConllu input
   where
-    input = source </> Text.unpack uid <.> "conllu"
-    output = target </> Text.unpack uid <.> "xml"
+    input = source </> relativePath article "conllu"
+    output = target </> uid article <.> ".fr.xml"
     --input = source </> "article" ++ show rank <.> "conllu"
     --output = target </> "article" ++ show rank <.> "xml"
 
@@ -64,16 +65,17 @@ format article outputPath dom = StateT $ \s -> do
                 ,selem "text" (formatParagraph <$> byParagraph dom)]]]
 
 metaFrom :: ArrowXml a => Article -> a n XmlTree
-metaFrom (Article {uid, tome, rank, headWord, authors, domains}) = txt $
+metaFrom article = txt $
+--metaFrom (Article {uid, tome, rank, headWord, authors, domains}) = txt $
   concatMap (++ "\n")
     (List.intercalate "\t" <$>
       [[]
-      ,["fileName", Text.unpack uid]
-      ,["tome", show tome]
-      ,["rank", show rank]
-      ,["head", Text.unpack headWord]
-      ,["author", Text.unpack . list $ getAuthors authors]
-      ,["domain", Text.unpack . list $ getDomains domains]])
+      ,["fileName", uid article]
+      ,["tome", show . tome $ get article]
+      ,["rank", show . rank $ get article]
+      ,["head", Text.unpack . headWord $ get article]
+      ,["author", Text.unpack . formatList . authors $ get article]
+      ,["domain", Text.unpack . formatList . domains $ get article]])
 
 formatParagraph :: Paragraph -> IOStateArrow Serial n XmlTree
 formatParagraph (Paragraph sents) =
@@ -89,7 +91,8 @@ main :: IO ()
 main = getArgs >>= cli
   where
     withSerial = (`evalStateT` (0, 0))
+    byTome = groupBy (tome.get) . sortBy uid
     cli [sourceMeta, sourceRoot, targetRoot] =
-        byTome sourceMeta
-      >>= either die (withSerial . mapM_ (sourceRoot `to` targetRoot) . toList)
+        readNamedTsv sourceMeta
+      >>= either die (withSerial . mapM_ (sourceRoot `to` targetRoot) . byTome)
     cli _ = syntax "METADATA_CSV_FILE SOURCE_DIR TARGET_DIR"
diff --git a/scripts/conllu-to-txm.hs b/scripts/conllu-to-txm.hs
index 8487819..69266da 100755
--- a/scripts/conllu-to-txm.hs
+++ b/scripts/conllu-to-txm.hs
@@ -1,18 +1,17 @@
 #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
-{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
+{-# LANGUAGE ExplicitNamespaces, NamedFieldPuns, OverloadedStrings #-}
 
 import Conllu.Parse (parseConllu)
 import Conllu.Type (AW, CW(..), ID(..), Sent(..))
 import Control.Applicative ((<|>), liftA2)
-import Data.Metadata
-  ( HasAuthors(..), Book, FromBook(..), relativePath, uid, groupBy, readTsv
-  , sortBy )
-import Data.Metadata.Article (Article(..))
-import Data.Text as Text (unpack)
+import GEODE.Metadata
+  ( Book, Contrastive(..), Entry(..), Has(..), MultiText(..), PrimaryKey(..)
+  , type(@), relativePath, groupBy, readNamedTsv, sortBy )
+import Data.Text as Text (breakOn, drop, pack, unpack, splitOn)
 import System.Environment (getArgs)
 import System.Exit (die)
 import System.FilePath ((</>), (<.>))
-import System.Script (syntax)
+import System.Script (syntax, try)
 import Text.TEI
   ( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title )
 import Text.XML.HXT.Core
@@ -20,9 +19,11 @@ import Text.XML.HXT.Core
   , attr, constL, mkelem, mkText, runX, selem, txt, unlistA, withIndent
   , writeDocument )
 
+type Article = PrimaryKey @ Entry @ Contrastive
+
 to :: FilePath -> FilePath -> (Book, [Article]) -> IO ()
 to source target (theBook, metadata) =
-  mapM_ runArrow . groupBy tome $ sortBy rank metadata
+  mapM_ runArrow . groupBy (tome.get) $ sortBy (rank.get) metadata
   where
     runArrow (tomeNumber, tomeArticles) =
       runX $ compileTome source target (theBook, tomeNumber, tomeArticles)
@@ -33,42 +34,35 @@ compileTome source target (theBook, tomeNumber, metadata) =
   tomeXml >>> writeDocument [withIndent True] output
   where
     strTome = show tomeNumber
-    output = target </> (show theBook) </> "T" <> strTome <.> "xml"
+    output = target </> (show theBook) <> "_T" <> strTome <.> "xml"
     tomeXml = selem "/"
                 [ selem "teiCorpus"
                   [ corpusHeader theBook strTome
                   , (articleFrom source $< constL metadata) ] ]
 
 articleFrom :: FilePath -> Article -> IOSArrow b XmlTree
-articleFrom source article@(Article {headWord}) =
+articleFrom source article =
   selem "TEI"
     [ teiHeader
-        (selem "titleStmt" [title $ Text.unpack headWord])
+        (selem "titleStmt" [title . Text.unpack . headWord.get $ article])
         (publicationStmt
           [ selem "p" [ txt "Annotated with Stanza by project GEODE" ] ])
         (sourceDesc [ authorArrow >>> selem "author" [ mkText ] ])
     , text "article" article (loadConllu input) ]
   where
-    input = source </> relativePath article <.> "conllu"
-    authorArrow = constL (Text.unpack <$> authors article)
-
-{-
-metadataFrom :: Article -> [ IOSArrow b XmlTree ]
-metadataFrom article@(Article {book, authors, domains}) =
-  [ "id" .= uid article
-  , "book" .= show book
-  , "author" .= list (getAuthors authors)
-  , "domains" .= list (getDomains domains) ]
--}
+    input = source </> relativePath article "conllu"
+    authorArrow = constL (Text.unpack <$> (getList . authors $ get article))
 
 loadConllu :: FilePath -> IOSArrow b XmlTree
-loadConllu input = arrIO0 (parseConllu input <$> readFile input)
-                  >>> ((arrIO die) ||| format)
+loadConllu input =
+  arrIO0 (parseConllu input <$> readFile input) >>> (arrIO debug ||| format)
   where
+    debug msg = die $ "In file " <> input <> "\n" <> msg
     format = selem "body" [ unlistA >>> formatSentence ]
     formatSentence = selem "s" [ arr _words >>> arrL render >>> formatWord ]
     formatWord = mkelem "w" [ attr "lemma" (txt $< getMaybe _lemma)
-                            , attr "pos" (txt $< getMaybe pos) ]
+                            , attr "pos" (txt $< getMaybe pos)
+                            , attr "type" (txt $< getMaybe misc) ]
                             [ txt $< getMaybe _form ]
     pos = liftA2 (<|>) spos _xpos
 
@@ -87,11 +81,15 @@ render (w:ws) = w:(render ws)
 spos :: CW a -> Maybe String
 spos = fmap show . _upos
 
+misc :: CW a -> Maybe String
+misc (CW {_misc}) = _misc >>= (fmap Text.unpack . getNer . Text.pack)
+  where
+    getNer = lookup "ner" . fmap (fmap (Text.drop 1) . breakOn "=") . splitOn "|"
+
 main :: IO ()
 main = getArgs >>= cli
   where
-    books = groupBy book . sortBy uid
+    books = groupBy (book.get)
     cli [inputMeta, sourceRoot, targetRoot] =
-        readTsv inputMeta
-      >>= either die (mapM_ (sourceRoot `to` targetRoot) . books)
+      try (readNamedTsv inputMeta) >>= mapM_ (sourceRoot `to` targetRoot) . books
     cli _ = syntax "METADATA_TSV_FILE SOURCE_DIR TARGET_DIR"
diff --git a/scripts/extract-corpus.sh b/scripts/extract-corpus.sh
index 4806dd1..5cc0f48 100755
--- a/scripts/extract-corpus.sh
+++ b/scripts/extract-corpus.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 
 INPUT_PATH="${1}"
 SOURCE_TEXT_ARTICLES="${2}"
diff --git a/scripts/extract-first-word.hs b/scripts/extract-first-word.hs
new file mode 100644
index 0000000..d41da32
--- /dev/null
+++ b/scripts/extract-first-word.hs
@@ -0,0 +1,175 @@
+{-# LANGUAGE OverloadedStrings #-}
+import Control.Applicative ((<|>), many, optional)
+import Data.Attoparsec.Text as Atto
+  ( Parser, char, choice, inClass, letter, many1, parseOnly, sepBy, space, string
+  , takeWhile )
+import Data.ByteString.Lazy as BS (getContents)
+import Data.Char (isUpper, toLower, toUpper)
+import Data.Csv (FromRecord(..), HasHeader(..), decode)
+--import Data.Csv (HasHeader(..), decode)
+import Data.Text as Text (Text, pack)
+import Data.Text.IO as Text (readFile)
+--import Data.Text.IO as Text (getContents, readFile)
+import System.Script (syntax, warn)
+import System.Environment (getArgs)
+import System.Exit (die)
+import System.FilePath ((</>))
+import Text.Printf (printf)
+
+parenthesis :: Parser Text
+parenthesis = char '(' *> Atto.takeWhile (/= ')') <* char ')'
+
+qualification :: Parser Text
+qualification =
+  qualifier <* optional (char 'e') <* optional (char 's') <* many1 space
+  where
+    qualifier = choice $ string . pack <$> (vocabulary ++ (ucFirst <$> vocabulary))
+    vocabulary = ["petit", "grand", "vaste", "belle", "fameux", "fameuse"]
+    ucFirst (c:cs) = toUpper c : cs
+    ucFirst s = s
+
+eddaFirstWord :: String -> Parser String
+eddaFirstWord headWord =
+  diderotStar *> many space *> form *> meta *> optional qualification *> many1 letter
+  where
+    diderotStar = optional (char '*')
+    pulp = Atto.takeWhile (inClass ",. \t\n\r")
+    form = string (pack headWord) <|> Atto.takeWhile isUpper
+    meta = pulp `sepBy` (abbreviation <|> parenthesis)
+    abbreviation = pack <$> many1 letter <* char '.'
+    --gram = choice [noun, adj] <* char '.'
+    --noun = string "s." <* optional space <* choice [char 'm', char 'f']
+    --adj = string "adj"
+
+lgeFirstWord :: String -> Parser String
+lgeFirstWord headWord = form *> pulp *> meta *> optional qualification *> pulp *> many1 letter
+  where
+    form = string (pack headWord)
+    pulp = Atto.takeWhile (inClass ",. \t\n\r")
+    meta = many (parenthesis *> pulp)
+
+type Triple = (Int, Int, String)
+newtype EDdA = EDdA { eddaTriple :: Triple }
+newtype LGE = LGE { lgeTriple :: Triple }
+
+eddaLine :: (Int, Int, String, String, String, String) -> EDdA
+eddaLine (tome, name, headWord, _, _, _) = EDdA (tome, name, headWord)
+
+lgeLine :: (String, Int, Int, String, String) -> LGE
+lgeLine (_,tome, name, headWord, _) = LGE (tome, name, headWord)
+
+instance FromRecord EDdA where
+  parseRecord = fmap eddaLine . parseRecord
+
+instance FromRecord LGE where
+  parseRecord = fmap lgeLine . parseRecord
+
+data BookModule a =
+  BookModule
+    { getTriple :: a -> Triple
+    , relativePath :: Int -> Int -> FilePath
+    , firstWordParser :: String -> Atto.Parser String }
+
+edda :: BookModule EDdA
+edda = BookModule
+         { getTriple = eddaTriple
+         , relativePath = printf "T%d/article%d.txt"
+         , firstWordParser = eddaFirstWord }
+
+lge :: BookModule LGE
+lge = BookModule
+         { getTriple = lgeTriple
+         , relativePath = printf "T%d/ByRank/%d.txt"
+         , firstWordParser = lgeFirstWord }
+
+extractFrom :: FromRecord a => FilePath -> BookModule a -> IO ()
+extractFrom rootDirectory book =
+  decode HasHeader <$> BS.getContents
+  >>= either die (mapM_ (wordFromTriple . getTriple book))
+  where
+    wordFromTriple (tome, name, headWord) =
+      Text.readFile (rootDirectory </> relativePath book tome name)
+      >>= pure . parseOnly (firstWordParser book headWord)
+      >>= either
+            (\_ -> warn $ printf "%d\t%d" tome name)
+            (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
+
+main :: IO ()
+main = getArgs >>= run
+  where
+    normalize = fmap toLower
+    run [book, rootDirectory]
+      | normalize book == "edda" = extractFrom rootDirectory edda
+      | normalize book == "edda" = extractFrom rootDirectory edda
+      | normalize book == "lge" = extractFrom rootDirectory lge
+    run _ = syntax "BOOK[either EDdA or LGE] SOURCE_DIRECTORY path to the root directory containing the articles (which CSV metadata are read from stdin)"
+
+{-
+
+parseEDdAArticle :: FilePath -> (Int, Int, String, String, String, String) -> IO ()
+parseEDdAArticle rootDirectory (tome, name, headWord, _, _, _) =
+  Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name))
+  >>= pure . parseOnly (eddaFirstWord headWord)
+  >>= either
+        (\_ -> warn $ printf "%d\t%d" tome name)
+        (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
+
+parseLGEArticle :: FilePath -> (String, Int, Int, String, String) -> IO ()
+parseLGEArticle rootDirectory (_,tome, name, headWord, _) =
+  Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name))
+  >>= pure . parseOnly (lgeFirstWord headWord)
+  >>= either
+        (\_ -> warn $ printf "%d\t%d" tome name)
+        (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
+
+extractFirstWord :: FilePath -> (String -> Parser String)
+  -> (String, Int, Int, String, String) -> IO ()
+  -- -> (Int, Int, String, String, String, String) -> IO ()
+extractFirstWord rootDirectory parser (_,tome, name, headWord, _) =
+--extractFirstWord rootDirectory parser (tome, name, headWord, _, _, _) =
+  Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name))
+  --Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name))
+  >>= pure . parseOnly (parser headWord)
+  >>= either
+        (\_ -> warn $ printf "%d\t%d" tome name)
+        (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
+
+data Book = EDdA | LGE
+
+data Triple a where
+  EDdA :: (Int, String, String, String, String, String) -> Triple (Int, String, String)
+  LGE :: (String, Int, String, String, String) -> Triple (Int, String, String)
+
+class Triple a where
+  getTriple :: a -> (Int, String, String)
+
+instance Triple EDdA (Int, String, String, String, String, String) where
+  getTriple 
+
+getTriple :: Triple a -> a
+getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord)
+getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord)
+
+data BookModule =
+  BookModule
+    { getRecord :: Triple (Int, String, String)
+    , relativePath :: (Int, String) -> FilePath
+    , firstWordParser :: Atto.Parser Text }
+
+eddaLine :: (Int, Int, String, String, String, String) -> (Int, String, String)
+eddaLine (tome, name, headWord, _, _, _) = (tome, name, headWord)
+
+eddaPath
+
+lgeLine :: (String, Int, String, String, String) -> (Int, String, String)
+lgeLine (_,tome, name, headWord, _) = (tome, name, headWord)
+
+data Format =
+  EDdA (Int, String, String, String, String, String) 
+  | LGE (String, Int, String, String, String)
+
+getTriple :: Format -> Triple
+getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord)
+getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord)
+
+-}
diff --git a/scripts/extract-first-word_lge-mode-backup.hs b/scripts/extract-first-word_lge-mode-backup.hs
new file mode 100644
index 0000000..76d3c79
--- /dev/null
+++ b/scripts/extract-first-word_lge-mode-backup.hs
@@ -0,0 +1,76 @@
+{-# LANGUAGE OverloadedStrings #-}
+import Control.Applicative ((<|>), many, optional)
+import Data.Attoparsec.Text as Atto
+  ( Parser, char, choice, inClass, letter, many1, parseOnly, sepBy, space, string
+  , takeWhile )
+import Data.ByteString.Lazy as BS (getContents)
+import Data.Char (isUpper, toLower, toUpper)
+import Data.Csv (HasHeader(..), decode)
+import Data.Text as Text (Text, pack)
+import Data.Text.IO as Text (getContents, readFile)
+import System.Script (syntax, warn)
+import System.Environment (getArgs)
+import System.Exit (die)
+import System.FilePath ((</>))
+import Text.Printf (printf)
+
+eddaFirstWord :: String -> Parser String
+eddaFirstWord headWord =
+  diderotStar *> many space *> head *> meta *> optional qualification *> many1 letter
+  where
+    diderotStar = optional (char '*')
+    pulp = Atto.takeWhile (inClass ",. \t\n\r")
+    head = string (pack headWord) <|> Atto.takeWhile isUpper --choice [Atto.takeWhile isUpper, string (pack headWord)]
+    meta = pulp `sepBy` (abbreviation <|> parenthesis)
+    --meta = pulp `sepBy` (choice [abbreviation, parenthesis])
+    --head = string (pack headWord) <* pulp `sepBy` (choice [abbreviation, parenthesis])
+    abbreviation = pack <$> many1 letter <* char '.'
+    --head = string (pack headWord) <* pulp `sepBy` (choice [gram, parenthesis])
+    gram = choice [noun, adj] <* char '.'
+    noun = string "s." <* optional space <* choice [char 'm', char 'f']
+    adj = string "adj"
+
+parenthesis :: Parser Text
+parenthesis = char '(' *> Atto.takeWhile (/= ')') <* char ')'
+
+qualification :: Parser Text
+qualification =
+  qualifier <* optional (char 'e') <* optional (char 's') <* many space
+  where
+    qualifier = choice $ string . pack <$> (vocabulary ++ (ucFirst <$> vocabulary))
+    vocabulary = ["petit", "grand", "vaste", "belle", "fameux", "fameuse"]
+    ucFirst (c:cs) = toUpper c : cs
+
+lgeFirstWord :: String -> Parser String
+lgeFirstWord headWord = head *> pulp *> meta *> optional qualification *> pulp *> many1 letter
+  where
+    head = string (pack headWord)
+    pulp = Atto.takeWhile (inClass ",. \t\n\r")
+    meta = many (parenthesis *> pulp)
+
+extractFirstWord :: FilePath -> (String -> Parser String)
+  -> (String, Int, Int, String, String) -> IO ()
+  -- -> (Int, Int, String, String, String, String) -> IO ()
+extractFirstWord rootDirectory parser (_,tome, name, headWord, _) =
+--extractFirstWord rootDirectory parser (tome, name, headWord, _, _, _) =
+  Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name))
+  --Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name))
+  >>= pure . parseOnly (parser headWord)
+  >>= either
+        (\_ -> warn $ printf "%d\t%d" tome name)
+        (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
+
+run :: [String] -> IO ()
+run [book, rootDirectory]
+  | normalizedBook == "edda" = apply eddaFirstWord
+  | normalizedBook == "lge" = apply lgeFirstWord
+  where
+    normalizedBook = toLower <$> book
+    apply parser =
+      BS.getContents
+      >>= either die (mapM_ $ extractFirstWord rootDirectory parser) . decode HasHeader
+
+run _ = syntax "BOOK[either EDdA or LGE] SOURCE_DIRECTORY path to the root directory containing the articles (which CSV metadata are read from stdin)"
+
+main :: IO ()
+main = getArgs >>= run
diff --git a/scripts/extract-from-source.sh b/scripts/extract-from-source.sh
new file mode 100755
index 0000000..c99cc55
--- /dev/null
+++ b/scripts/extract-from-source.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+BASE_DIR="${0%/*}"
+
+source ${BASE_DIR}/lib.sh
+
+if [ "$#" != 2 ]
+then
+	die "${0##*/} SOURCE_DIRECTORY TARGET_DIRECTORY"
+else
+	SOURCE="${1}"
+	TARGET="${2}"
+	[ -d "${SOURCE}" ] || die "SOURCE_DIRECTORY must be a directory (containing 1 .tei file per tome)"
+	[ -d "${TARGET}" ] || die "TARGET_DIRECTORY must be a directory (where output will be generated)"
+fi
+
+FILES_TSV="${TARGET}/files.tsv"
+printf "book	tome	rank	headWord	name	page\n" > "${FILES_TSV}"
+${BASE_DIR}/EDdA/extract-from-source.sh "${SOURCE}/EDdA/ARTFL" ${TARGET} >> "${FILES_TSV}"
+#${BASE_DIR}/LGE/extract-from-source.sh "${SOURCE}/LGE/BnF" ${TARGET} >> "${FILES_TSV}"
diff --git a/scripts/lib.sh b/scripts/lib.sh
new file mode 100644
index 0000000..94a0897
--- /dev/null
+++ b/scripts/lib.sh
@@ -0,0 +1,4 @@
+function die() {
+	printf "${1}\n"
+	exit 1
+}
diff --git a/scripts/merge.hs b/scripts/merge.hs
new file mode 100755
index 0000000..b0843d3
--- /dev/null
+++ b/scripts/merge.hs
@@ -0,0 +1,28 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-}
+
+import Control.Monad.Except (ExceptT(..), runExceptT)
+import Data.Map ((!?))
+import Data.Maybe (catMaybes)
+import Data.Vector (Vector, toList)
+import GEODE.Metadata (Contrastive, Entry, Has(..), PrimaryKey, type(@)(..), indexBy, readNamedTsv, tsvFile)
+import System.Environment (getArgs)
+import System.Exit (die)
+import System.Script (syntax)
+
+merge :: Vector (PrimaryKey @ Entry) -> Vector (PrimaryKey @ Contrastive) -> [PrimaryKey @ Entry @ Contrastive]
+merge left right = catMaybes (mergeRow <$> toList right)
+  where
+    indexed = get <$> indexBy get left
+    mergeRow (pK :@: contrastive) =
+      (\entry -> (pK :@: entry :@: contrastive)) <$> (indexed !? pK)
+
+main :: IO ()
+main = getArgs >>= run
+  where
+    run [left, right, output] =
+      runExceptT (merge
+                    <$> ExceptT (readNamedTsv left)
+                    <*> ExceptT (readNamedTsv right) )
+      >>= either die (tsvFile output)
+    run _ = syntax "PRIMARY_KEY.tsv INPUT_METADATA.tsv OUTPUT_METADATA.tsv"
diff --git a/scripts/parallel-links.hs b/scripts/parallel-links.hs
new file mode 100755
index 0000000..3243c5a
--- /dev/null
+++ b/scripts/parallel-links.hs
@@ -0,0 +1,36 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE ExplicitNamespaces #-}
+module Main where
+
+import Data.Char as Text (toLower)
+import Data.Text as Text (map)
+import GEODE.Metadata
+  ( Entry(..), Has(..), PrimaryKey(..), type(@), groupBy, readNamedTsv, tsvFile )
+import System.Environment (getArgs)
+import System.Exit (die)
+import System.Script (syntax)
+
+type Line = PrimaryKey @ Entry
+
+findPairs :: Foldable t => Bool -> t Line -> [Line]
+findPairs caseInsensitive =
+  concatMap snd . filter isPair . groupBy (normalize . headWord . get)
+  where
+    isPair = oneInEach . groupBy (book.get) . snd
+    oneInEach = (&&) <$> bothBooks <*> oneByBook
+    bothBooks = ((2 ==) . length)
+    oneByBook = all ((1 ==) . length . snd)
+    normalize
+      | caseInsensitive = Text.map toLower
+      | otherwise = id
+
+main :: IO ()
+main = getArgs >>= popCaseInsensitive run
+  where
+    run caseInsensitive [input, output] = readNamedTsv input
+      >>= either
+            die
+            (tsvFile output . findPairs caseInsensitive)
+    run _ _ = syntax "[-i] SOURCE_METADATA.tsv TARGET_METADATA.tsv"
+    popCaseInsensitive f ("-i":args) = f True args
+    popCaseInsensitive f args = f False args
diff --git a/scripts/parallel-links.py b/scripts/parallel-links.py
index 284dc5d..605b12b 100755
--- a/scripts/parallel-links.py
+++ b/scripts/parallel-links.py
@@ -27,12 +27,14 @@ def unserializeMetadata(path, rowReader):
             if header:
                 header = False
             else:
-                yield rowReader(row)
+                yield {
+                        'book': row[0]
+                        }
 
-def concat(generators):
-    for g in generators:
-        for x in g:
-            yield x
+#def concat(generators):
+#    for g in generators:
+#        for x in g:
+#            yield x
 
 def naiveIndexBy(field, elements):
     d = {}
@@ -44,63 +46,6 @@ def naiveIndexBy(field, elements):
             d[key] = e
     return d
 
-def growPrefixes(d, keys, maxLength):
-    for length in range(maxLength, 1, -1):
-        newGeneration = {}
-        for key in keys:
-            if len(key) == length:
-                newKey = key[:-1]
-                if newKey in newGeneration or newKey in d:
-                    newGeneration[newKey] = None
-                else:
-                    newGeneration[newKey] = d[key]
-        for key, value in newGeneration.items():
-            if value is not None:
-                d[key] = value
-                keys.add(key)
-            elif key not in d:
-                d[key] = value
-
-def indexBy(field, elements, prefix=True):
-    d = {}
-    for e in elements:
-        key = e[field]
-        if key in d:
-            d[key] = None
-        else:
-            d[key] = e
-    if prefix:
-        keys = set(d.keys())
-        growPrefixes(d, keys, max(map(len, keys)))
-    return d
-
-def headWords(head):
-    words = head.split()
-    if len(words) == 1:
-        return words
-    else:
-        return [w for w in map(lambda s: s.strip(',.'), words) if w.isupper()]
-
-def identify(head, haystack):
-    if head in haystack:
-        if haystack[head] is not None:
-            return {'type': 'exact', 'match': head, 'found': haystack[head]}
-        else:
-            return None
-    else:
-        prefix = head[:-1]
-        while len(prefix) > 0 and prefix not in haystack:
-            prefix = prefix[:-1]
-        if prefix in haystack and haystack[prefix] is not None:
-            return {
-                    'type': 'prefix',
-                    'match': head,
-                    'found': haystack[prefix],
-                    'precision': len(prefix) / len(head)
-                    }
-        else:
-            return None
-
 def naiveGetArrows(source, target):
     indexedSource = naiveIndexBy('head', source)
     indexedTarget = naiveIndexBy('head', target)
@@ -111,28 +56,85 @@ def naiveGetArrows(source, target):
                     'target': indexedTarget[head]
                     }
 
-def getArrows(source, target):
-    for article in source:
-        heads = headWords(article['head'])
-        identified = map(lambda w: identify(w, target), heads)
-        entries = [e for e in identified if e is not None]
-        if len(entries) == 1:
-            yield {
-                    'type': 'match',
-                    'source': article,
-                    'target': entries[0]
-                    }
-        elif len(entries) > 1:
-            yield {
-                    'type': 'ambiguity',
-                    'source': article,
-                    'target': entries
-                    }
-
-def interesting(arrow):
-    if arrow['type'] == 'match':
-        target = arrow['target']
-        return len(target['match']) > 3 and (target['type'] == 'exact' or target['precision'] > 0.8)
+#def growPrefixes(d, keys, maxLength):
+#    for length in range(maxLength, 1, -1):
+#        newGeneration = {}
+#        for key in keys:
+#            if len(key) == length:
+#                newKey = key[:-1]
+#                if newKey in newGeneration or newKey in d:
+#                    newGeneration[newKey] = None
+#                else:
+#                    newGeneration[newKey] = d[key]
+#        for key, value in newGeneration.items():
+#            if value is not None:
+#                d[key] = value
+#                keys.add(key)
+#            elif key not in d:
+#                d[key] = value
+
+#def indexBy(field, elements, prefix=True):
+#    d = {}
+#    for e in elements:
+#        key = e[field]
+#        if key in d:
+#            d[key] = None
+#        else:
+#            d[key] = e
+#    if prefix:
+#        keys = set(d.keys())
+#        growPrefixes(d, keys, max(map(len, keys)))
+#    return d
+
+#def headWords(head):
+#    words = head.split()
+#    if len(words) == 1:
+#        return words
+#    else:
+#        return [w for w in map(lambda s: s.strip(',.'), words) if w.isupper()]
+
+#def identify(head, haystack):
+#    if head in haystack:
+#        if haystack[head] is not None:
+#            return {'type': 'exact', 'match': head, 'found': haystack[head]}
+#        else:
+#            return None
+#    else:
+#        prefix = head[:-1]
+#        while len(prefix) > 0 and prefix not in haystack:
+#            prefix = prefix[:-1]
+#        if prefix in haystack and haystack[prefix] is not None:
+#            return {
+#                    'type': 'prefix',
+#                    'match': head,
+#                    'found': haystack[prefix],
+#                    'precision': len(prefix) / len(head)
+#                    }
+#        else:
+#            return None
+#
+#def getArrows(source, target):
+#    for article in source:
+#        heads = headWords(article['head'])
+#        identified = map(lambda w: identify(w, target), heads)
+#        entries = [e for e in identified if e is not None]
+#        if len(entries) == 1:
+#            yield {
+#                    'type': 'match',
+#                    'source': article,
+#                    'target': entries[0]
+#                    }
+#        elif len(entries) > 1:
+#            yield {
+#                    'type': 'ambiguity',
+#                    'source': article,
+#                    'target': entries
+#                    }
+
+#def interesting(arrow):
+#    if arrow['type'] == 'match':
+#        target = arrow['target']
+#        return len(target['match']) > 3 and (target['type'] == 'exact' or target['precision'] > 0.8)
 
 #gold = [a for a in arrows if interesting(a)]
 
diff --git a/scripts/select.hs b/scripts/select.hs
new file mode 100755
index 0000000..ba1c5f3
--- /dev/null
+++ b/scripts/select.hs
@@ -0,0 +1,27 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-}
+module Main where
+
+import Control.Monad.Except (ExceptT(..), runExceptT)
+import Data.Map ((!?))
+import Data.Maybe (catMaybes)
+import Data.Vector (Vector, toList)
+import GEODE.Metadata (Entry, Has(..), PrimaryKey, type(@), indexBy, readNamedTsv, tsvFile)
+import System.Environment (getArgs)
+import System.Exit (die)
+import System.Script (syntax)
+
+getMeta :: Vector PrimaryKey -> Vector (PrimaryKey @ Entry) -> [(PrimaryKey @ Entry)]
+getMeta coords meta = catMaybes ((indexed !?) <$> toList coords)
+  where
+    indexed = indexBy get meta
+
+main :: IO ()
+main = getArgs >>= run
+  where
+    run [keys, input, output] =
+      runExceptT ( getMeta
+                    <$> ExceptT (readNamedTsv keys)
+                    <*> ExceptT (readNamedTsv input) )
+      >>= either die (tsvFile output)
+    run _ = syntax "PRIMARY_KEY.tsv INPUT_METADATA.tsv OUTPUT_METADATA.tsv"
diff --git a/scripts/subcorpus/getFiles.hs b/scripts/subcorpus/getFiles.hs
new file mode 100755
index 0000000..ca99a95
--- /dev/null
+++ b/scripts/subcorpus/getFiles.hs
@@ -0,0 +1,29 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+
+import GEODE.Metadata (PrimaryKey(..), readNamedTsv, relativePath)
+import System.Directory (createDirectoryIfMissing, doesFileExist)
+import System.Environment (getArgs)
+import System.FilePath ((</>), takeDirectory)
+import System.Posix.Files (createLink)
+import System.Script (syntax, try)
+
+move :: String -> FilePath -> FilePath -> PrimaryKey -> IO ()
+move extension sourceRoot targetRoot article = do
+  fileExists <- doesFileExist sourceFile
+  if fileExists
+  then do
+    createDirectoryIfMissing True (takeDirectory targetFile)
+    createLink sourceFile targetFile
+  else
+    putStrLn sourceFile
+  where
+    fileName = relativePath article extension
+    sourceFile = sourceRoot </> fileName
+    targetFile = targetRoot </> fileName
+
+main :: IO ()
+main = getArgs >>= run
+  where
+    run [files, extension, source, target] =
+      try (readNamedTsv files) >>= mapM_ (move extension source target)
+    run _ = syntax "FILES.tsv EXTENSION SOURCE_DIRECTORY TARGET_DIRECTORY"
diff --git a/scripts/subcorpus/getTXMQuery.hs b/scripts/subcorpus/getTXMQuery.hs
new file mode 100755
index 0000000..854bf6a
--- /dev/null
+++ b/scripts/subcorpus/getTXMQuery.hs
@@ -0,0 +1,21 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE OverloadedStrings #-}
+
+import Data.Text as Text (Text, pack, intercalate)
+import Data.Text.IO as Text (putStr)
+import GEODE.Metadata (PrimaryKey, uid, readNamedTsv)
+import System.Environment (getArgs)
+import System.Script (syntax, try)
+import Text.Printf (printf)
+
+txmQuery :: Foldable t => t PrimaryKey -> Text
+txmQuery = ("/region[text,a]:: " <>) . Text.intercalate "|" . groupBy (book.get)
+  where
+    select = Text.pack . printf "a.text_uid=\"%s\"" . uid
+
+main :: IO ()
+main = getArgs >>= run
+  where
+    run [files] =
+      try (readNamedTsv files) >>= Text.putStr . txmQuery
+    run _ = syntax "FILES.tsv"
-- 
GitLab