From 9ab20ad22e412b8c2b05debe78ba016b302b1065 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Fri, 26 Jan 2024 17:28:28 +0100
Subject: [PATCH] Keep haskell codebase up to date with lib ghc-geode

---
 lib/haskell/Conllu/Tree.hs                    |   9 +-
 lib/haskell/GEODE/Metadata/ParagraphRecord.hs |  16 +-
 lib/haskell/GEODE/Metadata/ProdigyMeta.hs     |  11 +-
 lib/haskell/Text/TEI.hs                       |  10 +-
 scripts/conllu-to-lexicoscope.hs              | 161 ------------------
 scripts/fix-tsv.hs                            |  10 +-
 scripts/merge.hs                              |  46 ++---
 scripts/paragraphs.hs                         |  17 +-
 scripts/parallel-links.hs                     |  20 ++-
 scripts/sample.hs                             |  10 +-
 scripts/subcorpus/get-files.hs                |  20 +--
 .../{computeProfile.hs => compute-profile.hs} |  27 ++-
 scripts/textometry/conllu-to-lexicoscope.hs   | 111 ++++++++++++
 scripts/{ => textometry}/conllu-to-txm.hs     |  50 +++---
 .../textometry/{topNOUN.hs => top-noun.hs}    |  15 +-
 15 files changed, 243 insertions(+), 290 deletions(-)
 delete mode 100755 scripts/conllu-to-lexicoscope.hs
 rename scripts/textometry/{computeProfile.hs => compute-profile.hs} (80%)
 create mode 100755 scripts/textometry/conllu-to-lexicoscope.hs
 rename scripts/{ => textometry}/conllu-to-txm.hs (67%)
 rename scripts/textometry/{topNOUN.hs => top-noun.hs} (86%)

diff --git a/lib/haskell/Conllu/Tree.hs b/lib/haskell/Conllu/Tree.hs
index 9969117..8d5967b 100644
--- a/lib/haskell/Conllu/Tree.hs
+++ b/lib/haskell/Conllu/Tree.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE DeriveGeneric, OverloadedStrings, TypeFamilies #-}
+{-# LANGUAGE DeriveGeneric, OverloadedLists, OverloadedStrings, TypeFamilies #-}
 module Conllu.Tree
   ( EP(..)
   , Feat(..)
@@ -17,13 +17,12 @@ import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XP
 import qualified Conllu.UposTagset as Conllu (POS)
 import qualified Conllu.DeprelTagset as Conllu (EP)
 import Data.ByteString.Char8 as ByteString (pack)
-import Data.Csv ((.=), ToField(..), ToNamedRecord(..), namedRecord)
+import Data.Csv ((.=), DefaultOrdered(..), ToField(..), ToNamedRecord(..), namedRecord)
 import Data.Int (Int8)
 import Data.List (intercalate, partition)
 import Data.Map as Map (Map, empty, insert, toList)
 import Data.Serialize (Serialize(..))
 import Data.Tree (Forest, Tree(..))
-import GEODE.Metadata (DefaultFields(..), HasDefaultHeader(..))
 import GHC.Generics (Generic(..), K1(..), Rec0)
 
 data ID =
@@ -120,8 +119,8 @@ instance ToNamedRecord IndexedWord where
         show ep : maybe [] id ((:) <$> _subdep <*> _rest)
       showDep rel@(Rel {_head}) = ":" `sep` (show _head:deprel rel)
 
-instance HasDefaultHeader IndexedWord where
-  defaultFields = DefaultFields
+instance DefaultOrdered IndexedWord where
+  headerOrder _ =
     [ "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps"
     , "misc" ]
 
diff --git a/lib/haskell/GEODE/Metadata/ParagraphRecord.hs b/lib/haskell/GEODE/Metadata/ParagraphRecord.hs
index f37aaf0..57519e9 100644
--- a/lib/haskell/GEODE/Metadata/ParagraphRecord.hs
+++ b/lib/haskell/GEODE/Metadata/ParagraphRecord.hs
@@ -6,19 +6,22 @@ module GEODE.Metadata.ParagraphRecord
 import Data.Aeson ((.=), FromJSON(..), ToJSON(..))
 import Data.Aeson.KeyMap as KeyMap (singleton)
 import Data.Csv
-  ( (.:), FromNamedRecord(..), ToNamedRecord(..), namedField, namedRecord )
+  ( (.:), DefaultOrdered(..), FromNamedRecord(..), ToNamedRecord(..), namedField, namedRecord )
 import GEODE.Metadata as Article
-  ( type (@)(..), ArticleRecord, DefaultFields(..), HasDefaultHeader(..)
-  , Record(..), ToJSONObject(..), uid, relativePath)
+  ( type (@)(..), ArticleRecord, Record(..), ToJSONObject(..))
 import GHC.Generics (Generic)
 import System.FilePath ((<.>))
+import Text.Read (readEither)
 
-newtype Paragraph = Paragraph { paragraph :: Int } deriving (Generic, Show)
+newtype Paragraph = Paragraph { paragraph :: Int } deriving (Generic, Eq, Ord, Show)
 
+instance DefaultOrdered Paragraph
 instance FromJSON Paragraph
 
 instance Record Paragraph where
-  uid = show . paragraph
+  toUID = show . paragraph
+
+  fromUID = fmap Paragraph . readEither . takeWhile (/= '_')
 
   relativePath (Paragraph {paragraph}) extension = show paragraph <.> extension
 
@@ -35,6 +38,3 @@ instance ToJSONObject Paragraph where
   toJSONObject (Paragraph {paragraph}) =
     KeyMap.singleton "paragraph" (toJSON paragraph)
   toJSONPairs (Paragraph {paragraph}) = "paragraph" .= paragraph
-
-instance HasDefaultHeader Paragraph where
-  defaultFields = DefaultFields ["paragraph"]
diff --git a/lib/haskell/GEODE/Metadata/ProdigyMeta.hs b/lib/haskell/GEODE/Metadata/ProdigyMeta.hs
index edb3fdd..60ce511 100644
--- a/lib/haskell/GEODE/Metadata/ProdigyMeta.hs
+++ b/lib/haskell/GEODE/Metadata/ProdigyMeta.hs
@@ -7,9 +7,9 @@ module GEODE.Metadata.ProdigyMeta
 
 import Data.Aeson ((.=), FromJSON(..), ToJSON(..))
 import Data.Aeson.KeyMap as KeyMap (fromList)
-import Data.Csv (FromNamedRecord(..), ToNamedRecord(..))
+import Data.Csv (DefaultOrdered(..), FromNamedRecord(..), ToNamedRecord(..))
 import Data.Text (Text)
-import GEODE.Metadata (type (@), DefaultFields(..), HasDefaultHeader(..), ToJSONObject(..))
+import GEODE.Metadata (type (@), ToJSONObject(..))
 import GEODE.Metadata.ParagraphRecord (ParagraphRecord)
 import GHC.Generics (Generic)
 
@@ -26,18 +26,15 @@ instance ToJSONObject ProdigyMeta where
       "totalParagraphs" .= totalParagraphs
     <> "headword" .= headword
 
+instance DefaultOrdered ProdigyMeta
 instance ToNamedRecord ProdigyMeta
 instance FromNamedRecord ProdigyMeta
 
-instance HasDefaultHeader ProdigyMeta where
-  defaultFields = DefaultFields ["totalParagraphs", "headword"]
-
 newtype Classification = Classification { paragraphFunction :: Text } deriving Generic
 
+instance DefaultOrdered Classification
 instance ToNamedRecord Classification
 instance FromNamedRecord Classification
-instance HasDefaultHeader Classification where
-  defaultFields = DefaultFields ["paragraphFunction"]
 
 instance ToJSONObject Classification where
   toJSONObject (Classification {paragraphFunction}) = KeyMap.fromList
diff --git a/lib/haskell/Text/TEI.hs b/lib/haskell/Text/TEI.hs
index 558a616..ef47dcd 100644
--- a/lib/haskell/Text/TEI.hs
+++ b/lib/haskell/Text/TEI.hs
@@ -9,10 +9,10 @@ module Text.TEI
 
 import Data.Text (unpack)
 import GEODE.Metadata
-  (Book(..), Contrastive(..), Has(..), PrimaryKey(..), formatList, uid)
+  (ArticleRecord(..), Work(..), Contrastive(..), Has(..), Record(..), formatList)
 import Text.XML.HXT.Core (ArrowXml, XmlTree, aelem, attr, mkelem, selem, txt)
 
-corpusHeader :: ArrowXml a => Book -> String -> a b XmlTree
+corpusHeader :: ArrowXml a => Work -> String -> a b XmlTree
 corpusHeader EDdA = eddaHeader
 corpusHeader LGE = lgeHeader
 corpusHeader Wikipedia = wikiHeader
@@ -85,15 +85,15 @@ by :: ArrowXml a => String -> String -> [a b XmlTree]
 by resp orgName = [ selem "resp" [ txt (resp <> " by") ]
                   , selem "orgName" [ txt orgName ] ]
 
-type TXMText a = (Has Contrastive a, Has PrimaryKey a)
+type TXMText a = (Has Contrastive a, Has ArticleRecord a)
 
 text :: (ArrowXml a, TXMText b) => String -> b -> a c XmlTree -> a c XmlTree
 text unitName txmText body =
   mkelem "text" metadata [ body, aelem "milestone" [ "unit" .= unitName ] ]
   where
     metadata =
-      [ "uid" .= uid txmText
-      , "book" .= (show . book $ get txmText)
+      [ "uid" .= toUID (get txmText :: ArticleRecord)
+      , "work" .= (show . work $ get txmText)
       , "author" .= (unpack . formatList . authors $ get txmText)
       , "domains" .= (unpack . formatList . domains $ get txmText)
       , "subCorpus" .= (unpack . formatList . subCorpus $ get txmText) ]
diff --git a/scripts/conllu-to-lexicoscope.hs b/scripts/conllu-to-lexicoscope.hs
deleted file mode 100755
index 1d2a76a..0000000
--- a/scripts/conllu-to-lexicoscope.hs
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
-{-# LANGUAGE ExplicitNamespaces, NamedFieldPuns, OverloadedStrings #-}
-
-import Conllu.Parse (parseConllu)
-import Conllu.Parse.Paragraph (Paragraph(..), byParagraph)
-import Conllu.Print (printSent)
-import Conllu.Type (Sent(..))
---import Conllu.Type (Doc, Sent(..))
---import Control.Monad.IO.Class (MonadIO(..))
-import Control.Monad.State (StateT(..), evalStateT)
-import Data.List as List (intercalate)
---import Data.Map (toList)
-import GEODE.Metadata (Book, Contrastive(..), Entry(..), Has(..), MultiText(..), PrimaryKey(..), type(@), formatList, groupBy, readNamedTsv, relativePath, sortBy, uid)
-import Data.Text as Text (unpack)
-import System.Environment (getArgs)
-import System.Exit (die)
-import System.FilePath ((</>), (<.>))
-import System.Script (syntax, try)
-import Text.XML.HXT.Core
-  ((>>>), (|||), ($<), ArrowXml, IOSLA(..), IOStateArrow, XmlTree, arrIO, arrIO0, arrL, attr, constL, mkelem, selem
-  , txt, withIndent, writeDocument)
-import Text.XML.HXT.Arrow.XmlState.TypeDefs (Selector(..), chgS, theUserState)
-import Text.XML.HXT.Arrow.XmlState.RunIOStateArrow (initialState)
-
-type Article = PrimaryKey @ Entry @ Contrastive
-
-type Serial = (Int, Int)
-type WithSerial a = StateT Serial IO a
-type SerialArrow n a = IOStateArrow Serial n a
---type SerialArrow a = IOStateArrow Serial () a
-
-next :: Selector Serial Int -> SerialArrow n String
-next intSelector = IOSLA getUpdate
-  where
-    getUpdate state _ = pure (chgS field (+1) state, [show $ getS field state])
-    field = theUserState >>> intSelector
-
-pId :: Selector Serial Int
-pId = S {getS = fst, setS = \p (_, s) -> (p, s)}
-
-{-
-sId :: Selector Serial Int
-sId = S {getS = snd, setS = \s (p, _) -> (p, s)}
--}
-
-{-
-to :: FilePath -> FilePath -> (Int, [Article]) -> StateT Serial IO ()
-to source target (_, tome) =
-  mapM_ (processArticle source target) tome
--}
-
-stateOfArrow :: IOStateArrow s () a -> StateT s IO ()
-stateOfArrow a = StateT $ \s0 -> do
-  (s1, _) <- runIOSLA a (initialState s0) ()
-  pure ((), getS theUserState s1)
-
-to :: FilePath -> FilePath -> (Book, [Article]) -> WithSerial ()
-to source target (theBook, metadata) =
-  mapM_ runArrow . groupBy (tome.get) $ sortBy (rank.get) metadata
-  where
-    runArrow (tomeNumber, tomeArticles) = stateOfArrow $
-      compileTome source target (theBook, tomeNumber, tomeArticles)
-
-compileTome ::
-  FilePath -> FilePath -> (Book, Int, [Article]) -> SerialArrow n XmlTree
-compileTome source target (theBook, tomeNumber, metadata) =
-  tomeXml >>> writeDocument [withIndent True] output
-  where
-    strTome = show tomeNumber
-    output = target </> (show theBook) <> "_T" <> strTome <.> ".fr.xml"
-    tomeXml = selem "/"
-                [ selem "teiCorpus"
-                --[ selem "corpus"
-                  [ articleFrom source $< constL metadata ] ]
-
-articleFrom :: FilePath -> Article -> SerialArrow n XmlTree
-articleFrom source article =
-  selem "doc"
-    [ selem "meta" [metaFrom article]
-    , selem "text" [loadConllu input] ]
-
-    --(formatParagraph <$> byParagraph dom) ]
-  where
-    input = source </> relativePath article "conllu"
-
-loadConllu :: FilePath -> SerialArrow n XmlTree
-loadConllu input =
-  arrIO0 (parseConllu input <$> readFile input) >>> (arrIO debug ||| format)
-  where
-    debug msg = die $ "In file " <> input <> "\n" <> msg
-    format = formatParagraph $< arrL byParagraph
-
-metaFrom :: ArrowXml a => Article -> a n XmlTree
-metaFrom article = txt $
-  concatMap (++ "\n")
-    (List.intercalate "\t" <$>
-      [[]
-      ,["fileName", uid article]
-      ,["tome", show . tome $ get article]
-      ,["rank", show . rank $ get article]
-      ,["head", Text.unpack . headWord $ get article]
-      ,["author", Text.unpack . formatList . authors $ get article]
-      ,["domains", Text.unpack . formatList . domains $ get article]
-      ,["parallel", show . ("parallel" `elem`) . getList . subCorpus $ get article ]
-      ,["hydronym", show . ("hydronym" `elem`) . getList . subCorpus $ get article ]])
-
-{-
-formatSentence :: IOStateArrow Serial Sent XmlTree
-formatSentence :: IOStateArrow Serial Sent XmlTree
-formatSentence =
-  selem "s" [txt sentences]
-  where
-    sentences = dropWhile (== '\n') $ printSent (s {_meta = []})
--}
-
-formatParagraph :: Paragraph -> SerialArrow n XmlTree
---formatParagraph :: Paragraph -> SerialArrow XmlTree
-formatParagraph (Paragraph sents) =
-  mkelem "p" [attr "id" (txt . ('p':) $< next pId)] (formatSentence <$> sents)
-
-formatSentence :: Sent -> SerialArrow n XmlTree
-formatSentence s =
-  selem "s" [txt sentences]
-  --mkelem "s" [attr "id" (txt . ('s':) $< next sId)] [txt sentences]
-  where
-    sentences = dropWhile (== '\n') $ printSent (s {_meta = []})
-
-{-
-processArticle :: FilePath -> FilePath -> Article -> StateT Serial IO ()
-processArticle source target article =
---processArticle source target a@(Article {rank}) =
-  liftIO (putStrLn ("processing " <> input) *> Prelude.readFile input)
-    >>= either (liftIO . die) (format article output) . parseConllu input
-  where
-    input = source </> relativePath article "conllu"
-    output = target </> uid article <.> ".fr.xml"
-    --input = source </> "article" ++ show rank <.> "conllu"
-    --output = target </> "article" ++ show rank <.> "xml"
-
-format :: Article -> FilePath -> Doc -> StateT Serial IO ()
-format _ _ [] = pure ()
-format article outputPath dom = StateT $ \s -> do
-  (newState, _) <- runIOSLA (xml >>> writeDocument [] outputPath) (initialState s) ()
-  pure ((), getS theUserState newState)
-  where
-    xml = selem "/"
-            [selem "corpus"
-              [selem "doc"
-                [selem "meta" [metaFrom article]
-                ,selem "text" (formatParagraph <$> byParagraph dom)]]]
--}
-
-main :: IO ()
-main = getArgs >>= cli
-  where
-    withSerial = (`evalStateT` (0, 0))
-    books = groupBy (book.get)
-    cli [sourceMeta, sourceRoot, targetRoot] =
-        try (readNamedTsv sourceMeta)
-      >>= (withSerial . mapM_ (sourceRoot `to` targetRoot) . books)
-    cli _ = syntax "METADATA_CSV_FILE SOURCE_DIR TARGET_DIR"
diff --git a/scripts/fix-tsv.hs b/scripts/fix-tsv.hs
index 749dbbd..a3d17ca 100755
--- a/scripts/fix-tsv.hs
+++ b/scripts/fix-tsv.hs
@@ -1,20 +1,20 @@
-#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
 {-# LANGUAGE OverloadedStrings #-}
 
 import Data.ByteString as BS (toStrict)
-import Data.Csv (EncodeOptions(..), encodeWith, defaultEncodeOptions)
+import Data.Csv (encodeWith)
 import Data.Text as Text (Text, splitOn)
 import Data.Text.Encoding as Text (decodeUtf8)
+import GEODE.Metadata (toTSV)
 import System.Environment (getArgs)
 import System.Exit (die)
 import System.Script (syntax)
 import Text.Filter (xargs)
 
 fixTsv :: [Text] -> IO Text
-fixTsv = fmap toTsv . mapM fixLine
+fixTsv = fmap encode . mapM fixLine
   where
-    toTsv = decodeUtf8 . BS.toStrict . encodeWith
-              (defaultEncodeOptions { encDelimiter = toEnum $ fromEnum '\t' })
+    encode = decodeUtf8 . BS.toStrict . encodeWith toTSV
     fixLine = escapeFormLemma . Text.splitOn "\t"
     escapeFormLemma [n, form, lemma, pos, ene] = pure (n, form, lemma, pos, ene)
     escapeFormLemma l = die $ show l
diff --git a/scripts/merge.hs b/scripts/merge.hs
index b42440c..75cec17 100755
--- a/scripts/merge.hs
+++ b/scripts/merge.hs
@@ -1,35 +1,41 @@
 #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
 {-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings, ScopedTypeVariables #-}
 
-import Control.Monad.Except (ExceptT(..), runExceptT)
-import Data.Foldable (toList)
+import Control.Monad.Except (runExceptT)
+import Data.Csv (DefaultOrdered(..))
 import Data.Map (Map, (!?))
-import Data.Maybe (catMaybes)
 import Data.Text (Text, pack)
-import GEODE.Metadata (Contrastive(..), Entry, Has(..), MultiText(..), ArticleRecord, type(@)(..), indexBy, readNamedTsv, tsvFile)
+import Data.Vector (catMaybes)
+import GEODE.Metadata
+  ( type(@)(..), ArticleRecord, Contrastive(..), Document(..), Entry, Has(..)
+  , MultiText(..), ReadTSV(..), WriteTSV(..), for, getHeader, indexBy)
 import System.Environment (getArgs)
 import System.Exit (die)
 import System.Script (syntax)
 
-leftJoin :: forall a b c t. (Foldable t, Functor t, Has ArticleRecord a, Has ArticleRecord b) =>
-  (a -> Maybe b -> c) -> t a -> t b -> t c
-leftJoin f left right = outputRow <$> left
+leftJoin :: forall a b c. (Has ArticleRecord a, Has ArticleRecord b, DefaultOrdered c) =>
+  (a -> Maybe b -> c) -> Document a -> Document b -> Document c
+leftJoin f left right = Document
+  { header = getHeader (for :: c)
+  , rows = outputRow <$> rows left }
   where
     indexed :: Map ArticleRecord b
-    indexed = indexBy get right
+    indexed = indexBy get (rows right)
     outputRow leftLine =
       f leftLine (indexed !? get leftLine)
 
-rightJoin :: (Foldable t, Functor t, Has ArticleRecord a, Has ArticleRecord b) =>
-  (Maybe a -> b -> c) -> t a -> t b -> t c
+rightJoin :: forall a b c. (Has ArticleRecord a, Has ArticleRecord b, DefaultOrdered c) =>
+  (Maybe a -> b -> c) -> Document a -> Document b -> Document c
 rightJoin f left right = leftJoin (flip f) right left
 
-join :: forall a b c t. (Foldable t, Functor t, Has ArticleRecord a, Has ArticleRecord b) =>
-  (a -> b -> c) -> t a -> t b -> [c]
-join f left right = catMaybes . toList $ outputRow <$> left
+join :: forall a b c. (Has ArticleRecord a, Has ArticleRecord b, DefaultOrdered c) =>
+  (a -> b -> c) -> Document a -> Document b -> Document c
+join f left right = Document
+  { header = getHeader (for :: c)
+  , rows = catMaybes $ outputRow <$> rows left }
   where
     indexed :: Map ArticleRecord b
-    indexed = indexBy get right
+    indexed = indexBy get (rows right)
     outputRow leftLine =
       f leftLine <$> (indexed !? get leftLine)
 
@@ -48,12 +54,12 @@ main = getArgs >>= run
   where
     run [left, right, output] =
       runExceptT ((join merge)
-                    <$> ExceptT (readNamedTsv left)
-                    <*> ExceptT (readNamedTsv right) )
-      >>= either die (tsvFile output)
+                    <$> (readTSV left)
+                    <*> (readTSV right) )
+      >>= either die (writeTSV output)
     run [name, left, right, output] =
       runExceptT ((rightJoin (tag $ pack name))
-                    <$> ExceptT (readNamedTsv left)
-                    <*> ExceptT (readNamedTsv right) )
-      >>= either die (tsvFile output)
+                    <$> (readTSV left)
+                    <*> (readTSV right) )
+      >>= either die (writeTSV output)
     run _ = syntax "PRIMARY_KEY.tsv INPUT_METADATA.tsv OUTPUT_METADATA.tsv"
diff --git a/scripts/paragraphs.hs b/scripts/paragraphs.hs
index 03597cd..08b3036 100755
--- a/scripts/paragraphs.hs
+++ b/scripts/paragraphs.hs
@@ -3,14 +3,16 @@
 import Data.List (dropWhileEnd)
 import Data.Text (Text, splitOn)
 import Data.Text.IO as Text (readFile, writeFile)
+import Data.Vector as Vector (Vector, fromList)
 import GEODE.Metadata as Article
-  ( type (@)(..), ArticleRecord, Entry(..), Record(..), readNamedTsv, tsvFile )
+  ( type (@)(..), ArticleRecord, Document(..), Entry(..), ReadTSV(..)
+  , Record(..), WriteTSV(..), getHeader, for )
 import GEODE.Metadata.ParagraphRecord (Paragraph(..))
 import GEODE.Metadata.ProdigyMeta as Prodigy (ParagraphMeta, ProdigyMeta(..))
 import System.Directory (createDirectoryIfMissing)
 import System.Environment (getArgs)
 import System.FilePath ((</>), (<.>), isPathSeparator, takeDirectory)
-import System.Script (syntax, try)
+import System.Script (syntax)
 import Text.Filter (Editable(..))
 import Text.Filter.Linearize (linearize)
 
@@ -38,11 +40,16 @@ to source target meta@(articleRecord :@: _) = do
       let outputPath = target </> relativePath paragraphRecord "txt" in
       paragraphMeta <$ Text.writeFile outputPath paragraphText
 
+asDocument :: Vector [ParagraphMeta] -> Document ParagraphMeta
+asDocument batches = Document
+  { header = getHeader (for :: ParagraphMeta)
+  , rows = Vector.fromList $ concat batches }
+
 main :: IO ()
 main = (fmap (dropWhileEnd isPathSeparator) <$> getArgs) >>= run
   where
     run [inputMeta, source, target] =
-        try (readNamedTsv inputMeta)
-      >>= mapM (source `to` target)
-      >>= tsvFile (takeDirectory target </> "files" <.> "tsv") . concat
+        readTSV inputMeta
+      >>= mapM (source `to` target) . rows
+      >>= writeTSV (takeDirectory target </> "files" <.> "tsv") . asDocument
     run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY TARGET_DIRECTORY"
diff --git a/scripts/parallel-links.hs b/scripts/parallel-links.hs
index 9a7b85e..2081d76 100755
--- a/scripts/parallel-links.hs
+++ b/scripts/parallel-links.hs
@@ -3,23 +3,26 @@
 module Main where
 
 import Data.Char as Text (toLower)
-import Data.Text as Text (map)
+import Data.Map (Map)
+import Data.Text as Text (Text, map)
 import GEODE.Metadata
-  ( Entry(..), Has(..), ArticleRecord(..), type(@), groupBy, readNamedTsv, tsvFile )
+  ( type(@), ArticleRecord(..), Document(..), Entry(..), Has(..), ReadTSV(..)
+  , WriteTSV(..), groupBy )
+import Data.Vector as Vector (fromList)
 import System.Environment (getArgs)
-import System.Exit (die)
 import System.Script (syntax)
 
-type Line = ArticleRecord @ Entry
+type Line = ArticleRecord @ Entry @ Map Text Text
 
-findDiachronicPairs :: Foldable t => Bool -> t Line -> [Line]
-findDiachronicPairs caseInsensitive =
-  concatMap snd . filter isPair . groupBy (normalize . headword . get)
+findDiachronicPairs :: Bool -> Document Line -> Document Line
+findDiachronicPairs caseInsensitive d@(Document {rows}) =
+  d { rows = Vector.fromList . concatMap snd $ filter isPair groups  }
   where
     isPair = oneInEach . groupBy (work.get) . snd
     oneInEach = (&&) <$> twoBooks <*> oneByBook
     twoBooks = ((2 ==) . length)
     oneByBook = all ((1 ==) . length . snd)
+    groups = groupBy (normalize . headword . get) rows
     normalize
       | caseInsensitive = Text.map toLower
       | otherwise = id
@@ -28,8 +31,7 @@ main :: IO ()
 main = getArgs >>= popCaseInsensitive run
   where
     run caseInsensitive [input, output] =
-      readNamedTsv input
-      >>= either die (tsvFile output . findDiachronicPairs caseInsensitive)
+      readTSV input >>= writeTSV output . findDiachronicPairs caseInsensitive
     run _ _ = syntax "[-i] SOURCE_METADATA.tsv TARGET_METADATA.tsv"
     popCaseInsensitive f ("-i":args) = f True args
     popCaseInsensitive f args = f False args
diff --git a/scripts/sample.hs b/scripts/sample.hs
index 50a54ff..3145036 100755
--- a/scripts/sample.hs
+++ b/scripts/sample.hs
@@ -2,10 +2,10 @@
 import Data.IntSet as IntSet (IntSet, delete, fromList, insert, member, toList)
 import Data.Text (Text)
 import Data.Vector as Vector ((!), Vector, fromList)
-import GEODE.Metadata (readTsv, tsvLines)
+import GEODE.Metadata (ReadTSV(..), WriteTSV(..))
 import System.Environment (getArgs)
 import System.Random (randomRIO)
-import System.Script (syntax, try)
+import System.Script (syntax)
 
 sampleIndices :: Int -> (Int, Int) -> IO IntSet
 sampleIndices count (from, to)
@@ -24,7 +24,7 @@ sampleIndices count (from, to)
     build 0 result = pure result
     build n set = build (n-1) =<< pickNewOne set
 
-sample :: Int -> Vector a -> IO (Vector a)
+sample :: Int -> Vector [Text] -> IO (Vector [Text])
 sample count rows = Vector.fromList <$> sampledList
   where
     sampledList
@@ -37,7 +37,5 @@ main :: IO ()
 main = getArgs >>= run
   where
     run [size, inputMetadata] =
-        (try (readTsv inputMetadata) :: IO (Vector [Text]))
-      >>= sample (read size)
-      >>= tsvLines
+        readTSV inputMetadata >>= sample (read size) >>= writeTSV ()
     run _ = syntax "SIZE INPUT_METADATA"
diff --git a/scripts/subcorpus/get-files.hs b/scripts/subcorpus/get-files.hs
index 1b0167d..c6a267b 100755
--- a/scripts/subcorpus/get-files.hs
+++ b/scripts/subcorpus/get-files.hs
@@ -1,14 +1,13 @@
 #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
 
-import Data.Csv (FromNamedRecord)
-import Data.Vector (Vector)
-import GEODE.Metadata (ArticleRecord, Record(..), readNamedTsv)
+import Control.Monad.Except (ExceptT, runExceptT)
+import GEODE.Metadata (ArticleRecord, Document(..), ReadTSV(..), Record(..))
 import GEODE.Metadata.ParagraphRecord (ParagraphRecord)
 import System.Directory (createDirectoryIfMissing, doesFileExist)
 import System.Environment (getArgs)
 import System.FilePath ((</>), takeDirectory)
 import System.Posix.Files (createLink)
-import System.Script (syntax, try)
+import System.Script (syntax)
 
 link :: Record a => String -> FilePath -> FilePath -> a -> IO ()
 link extension sourceRoot targetRoot record = do
@@ -24,17 +23,16 @@ link extension sourceRoot targetRoot record = do
     sourceFile = sourceRoot </> fileName
     targetFile = targetRoot </> fileName
 
-type As a = a -> IO ()
+type As a = Document a -> IO ()
 
 run :: [String] -> IO ()
 run [files, extension, source, target] =
-  tsvData >>= either reTryAsArticles (mapM_ (mkLink :: As ParagraphRecord))
+  runExceptT (readTSV files :: ExceptT String IO (Document ParagraphRecord))
+    >>= either reTryAsArticles (mkLinks :: As ParagraphRecord)
   where
-    tsvData :: (FromNamedRecord a, Record a) => IO (Either String (Vector a))
-    tsvData = readNamedTsv files
-    mkLink :: Record a => As a
-    mkLink = link extension source target
-    reTryAsArticles _ = try tsvData >>= (mapM_ (mkLink :: As ArticleRecord))
+    mkLinks :: Record a => As a
+    mkLinks = mapM_ (link extension source target) . rows
+    reTryAsArticles _ = readTSV files >>= (mkLinks :: As ArticleRecord)
 
 run _ = syntax "FILES.tsv EXTENSION SOURCE_DIRECTORY TARGET_DIRECTORY"
 
diff --git a/scripts/textometry/computeProfile.hs b/scripts/textometry/compute-profile.hs
similarity index 80%
rename from scripts/textometry/computeProfile.hs
rename to scripts/textometry/compute-profile.hs
index 51cfcbb..4ac3480 100755
--- a/scripts/textometry/computeProfile.hs
+++ b/scripts/textometry/compute-profile.hs
@@ -7,22 +7,21 @@ import Conllu.Tree
 import Control.Applicative ((<**>))
 import Control.Monad.IO.Class (liftIO)
 import Control.Monad.Reader (ReaderT, asks, runReaderT)
-import Data.Csv (ToNamedRecord(..))
+import Data.Csv (DefaultOrdered(..), ToNamedRecord(..))
 import Data.ByteString as ByteString (readFile)
 import Data.List (sort)
 import Data.Map as Map (lookup)
 import Data.Serialize (decode)
 import Data.Tree (Tree(..))
 import GEODE.Metadata
-  ( type(@), ArticleRecord, DefaultFields(..), HasDefaultHeader(..), Record(..)
-  , WithDefaultHeader(..), glue, readNamedTsv, tsvLines )
-import GEODE.Metadata.TSV.Header (for, getHeader)
+  ( type(@), ArticleRecord, Document(..), ReadTSV(..), Record(..)
+  , WithDefaultHeader, WriteTSV(..), for, getHeader, glue )
 import GHC.Generics (Generic)
 import Options.Applicative
   ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc
   , short, strOption )
 import System.FilePath ((</>))
-import System.Script (try, warn)
+import System.Script (warn)
 
 data Config = Config
   { inputRoot :: FilePath
@@ -46,13 +45,12 @@ getConfig = execParser
     <> progDesc "A textometric tool to draw discursive profiles"))
 
 data Occurrence = Occurrence
-  { totalSize :: Int
-  , position :: Int
-  , size :: Int } deriving Generic
+  { position :: Int
+  , size :: Int
+  , totalSize :: Int } deriving Generic
 
+instance DefaultOrdered Occurrence
 instance ToNamedRecord Occurrence
-instance HasDefaultHeader Occurrence where
-  defaultFields = DefaultFields ["position", "size", "totalSize"]
 
 type Measure = ArticleRecord @ Occurrence
 
@@ -67,7 +65,6 @@ profile articleRecord = do
 findOccurrences :: IndexedDocument -> [Occurrence]
 findOccurrences (IndexedDocument {_total, _sentences}) =
   groupOccurrences [] (_sentences >>= imperativeVerb)
-  --uncurry (Occurrence _total) <$> (_sentences >>= imperativeVerb)
   where
     groupOccurrences :: [(Int, Int)] -> [Int] -> [Occurrence]
     groupOccurrences stack [] = uncurry (Occurrence _total) <$> reverse stack
@@ -92,7 +89,7 @@ imperativeVerb (IndexedSentence {_offset, _syntax}) =
 main :: IO ()
 main = getConfig >>= runReaderT chain
   where
-    chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay
-    searchAndDisplay rows = do
-      liftIO $ tsvLines [getHeader (for :: Measure)]
-      mapM_ (\ar -> profile ar >>= liftIO . tsvLines) rows
+    chain = asks inputTsv >>= liftIO . readTSV >>= searchAndDisplay
+    searchAndDisplay (Document {rows}) = do
+      liftIO $ writeTSV () [getHeader (for :: Measure)]
+      mapM_ (\ar -> profile ar >>= liftIO . writeTSV ()) rows
diff --git a/scripts/textometry/conllu-to-lexicoscope.hs b/scripts/textometry/conllu-to-lexicoscope.hs
new file mode 100755
index 0000000..0ceb48b
--- /dev/null
+++ b/scripts/textometry/conllu-to-lexicoscope.hs
@@ -0,0 +1,111 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
+{-# LANGUAGE ExplicitNamespaces, NamedFieldPuns, OverloadedStrings #-}
+
+import Conllu.Parse (parseConllu)
+import Conllu.Parse.Paragraph (Paragraph(..), byParagraph)
+import Conllu.Print (printSent)
+import Conllu.Type (Sent(..))
+import Control.Monad.State (StateT(..), evalStateT)
+import Data.List as List (intercalate)
+import GEODE.Metadata
+  ( type(@), ArticleRecord(..), Contrastive(..), Document(..), Entry(..), Has(..)
+  , MultiText(..), ReadTSV(..), Record(..), Work, formatList, groupBy, sortBy)
+import Data.Text as Text (unpack)
+import System.Environment (getArgs)
+import System.Exit (die)
+import System.FilePath ((</>), (<.>))
+import System.Script (syntax)
+import Text.XML.HXT.Core
+  ((>>>), (|||), ($<), ArrowXml, IOSLA(..), IOStateArrow, XmlTree, arrIO, arrIO0, arrL, attr, constL, mkelem, selem
+  , txt, withIndent, writeDocument)
+import Text.XML.HXT.Arrow.XmlState.TypeDefs (Selector(..), chgS, theUserState)
+import Text.XML.HXT.Arrow.XmlState.RunIOStateArrow (initialState)
+
+type Row = ArticleRecord @ Entry @ Contrastive
+
+type Serial = (Int, Int)
+type WithSerial a = StateT Serial IO a
+type SerialArrow n a = IOStateArrow Serial n a
+
+next :: Selector Serial Int -> SerialArrow n String
+next intSelector = IOSLA getUpdate
+  where
+    getUpdate state _ = pure (chgS field (+1) state, [show $ getS field state])
+    field = theUserState >>> intSelector
+
+pId :: Selector Serial Int
+pId = S {getS = fst, setS = \p (_, s) -> (p, s)}
+
+stateOfArrow :: IOStateArrow s () a -> StateT s IO ()
+stateOfArrow a = StateT $ \s0 -> do
+  (s1, _) <- runIOSLA a (initialState s0) ()
+  pure ((), getS theUserState s1)
+
+to :: FilePath -> FilePath -> (Work, [Row]) -> WithSerial ()
+to source target (theWork, metadata) =
+  mapM_ runArrow . groupBy (volume.get) $ sortBy (article.get) metadata
+  where
+    runArrow (volumeNumber, articles) = stateOfArrow $
+      compileTome source target (theWork, volumeNumber, articles)
+
+compileTome ::
+  FilePath -> FilePath -> (Work, Int, [Row]) -> SerialArrow n XmlTree
+compileTome source target (theWork, volumeNumber, metadata) =
+  volumeXml >>> writeDocument [withIndent True] output
+  where
+    strTome = show volumeNumber
+    output = target </> (show theWork) <> "_T" <> strTome <.> ".fr.xml"
+    volumeXml = selem "/"
+                [ selem "teiCorpus"
+                --[ selem "corpus"
+                  [ articleFrom source $< constL metadata ] ]
+
+articleFrom :: FilePath -> Row -> SerialArrow n XmlTree
+articleFrom source row =
+  selem "doc"
+    [ selem "meta" [metaFrom row]
+    , selem "text" [loadConllu input] ]
+  where
+    input = source </> relativePath (get row :: ArticleRecord) "conllu"
+
+loadConllu :: FilePath -> SerialArrow n XmlTree
+loadConllu input =
+  arrIO0 (parseConllu input <$> readFile input) >>> (arrIO debug ||| format)
+  where
+    debug msg = die $ "In file " <> input <> "\n" <> msg
+    format = formatParagraph $< arrL byParagraph
+
+metaFrom :: ArrowXml a => Row -> a n XmlTree
+metaFrom row = txt $
+  concatMap (++ "\n")
+    (List.intercalate "\t" <$>
+      [[]
+      ,["fileName", toUID (get row :: ArticleRecord)]
+      ,["work", show . work $ get row]
+      ,["volume", show . volume $ get row]
+      ,["article", show . article $ get row]
+      ,["head", Text.unpack . headword $ get row]
+      ,["author", Text.unpack . formatList . authors $ get row]
+      ,["domains", Text.unpack . formatList . domains $ get row]
+      ,["parallel", show . ("parallel" `elem`) . getList . subCorpus $ get row ]
+      ,["hydronym", show . ("hydronym" `elem`) . getList . subCorpus $ get row ]])
+
+formatParagraph :: Paragraph -> SerialArrow n XmlTree
+formatParagraph (Paragraph sents) =
+  mkelem "p" [attr "id" (txt . ('p':) $< next pId)] (formatSentence <$> sents)
+
+formatSentence :: Sent -> SerialArrow n XmlTree
+formatSentence s =
+  selem "s" [txt sentences]
+  where
+    sentences = dropWhile (== '\n') $ printSent (s {_meta = []})
+
+main :: IO ()
+main = getArgs >>= cli
+  where
+    withSerial = (`evalStateT` (0, 0))
+    works = groupBy (work.get)
+    cli [sourceMeta, sourceRoot, targetRoot] =
+      readTSV sourceMeta
+        >>= (withSerial . mapM_ (sourceRoot `to` targetRoot) . works) . rows
+    cli _ = syntax "METADATA_CSV_FILE SOURCE_DIR TARGET_DIR"
diff --git a/scripts/conllu-to-txm.hs b/scripts/textometry/conllu-to-txm.hs
similarity index 67%
rename from scripts/conllu-to-txm.hs
rename to scripts/textometry/conllu-to-txm.hs
index 69266da..298a203 100755
--- a/scripts/conllu-to-txm.hs
+++ b/scripts/textometry/conllu-to-txm.hs
@@ -1,17 +1,17 @@
-#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
 {-# LANGUAGE ExplicitNamespaces, NamedFieldPuns, OverloadedStrings #-}
 
 import Conllu.Parse (parseConllu)
 import Conllu.Type (AW, CW(..), ID(..), Sent(..))
 import Control.Applicative ((<|>), liftA2)
 import GEODE.Metadata
-  ( Book, Contrastive(..), Entry(..), Has(..), MultiText(..), PrimaryKey(..)
-  , type(@), relativePath, groupBy, readNamedTsv, sortBy )
+  ( type(@)(..), ArticleRecord(..), Contrastive(..), Document(..), Entry(..), Has(..)
+  , MultiText(..), ReadTSV(..), Record(..), Work, groupBy, sortBy )
 import Data.Text as Text (breakOn, drop, pack, unpack, splitOn)
 import System.Environment (getArgs)
 import System.Exit (die)
 import System.FilePath ((</>), (<.>))
-import System.Script (syntax, try)
+import System.Script (syntax)
 import Text.TEI
   ( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title )
 import Text.XML.HXT.Core
@@ -19,39 +19,39 @@ import Text.XML.HXT.Core
   , attr, constL, mkelem, mkText, runX, selem, txt, unlistA, withIndent
   , writeDocument )
 
-type Article = PrimaryKey @ Entry @ Contrastive
+type Row = ArticleRecord @ Entry @ Contrastive
 
-to :: FilePath -> FilePath -> (Book, [Article]) -> IO ()
-to source target (theBook, metadata) =
-  mapM_ runArrow . groupBy (tome.get) $ sortBy (rank.get) metadata
+to :: FilePath -> FilePath -> (Work, [Row]) -> IO ()
+to source target (theWork, metadata) =
+  mapM_ runArrow . groupBy (volume.get) $ sortBy (article.get) metadata
   where
-    runArrow (tomeNumber, tomeArticles) =
-      runX $ compileTome source target (theBook, tomeNumber, tomeArticles)
+    runArrow (volumeNumber, articles) =
+      runX $ compileTome source target (theWork, volumeNumber, articles)
 
 compileTome ::
-  FilePath -> FilePath -> (Book, Int, [Article]) -> IOSArrow b XmlTree
-compileTome source target (theBook, tomeNumber, metadata) =
-  tomeXml >>> writeDocument [withIndent True] output
+  FilePath -> FilePath -> (Work, Int, [Row]) -> IOSArrow b XmlTree
+compileTome source target (theWork, volumeNumber, metadata) =
+  volumeXml >>> writeDocument [withIndent True] output
   where
-    strTome = show tomeNumber
-    output = target </> (show theBook) <> "_T" <> strTome <.> "xml"
-    tomeXml = selem "/"
+    strTome = show volumeNumber
+    output = target </> (show theWork) <> "_T" <> strTome <.> "xml"
+    volumeXml = selem "/"
                 [ selem "teiCorpus"
-                  [ corpusHeader theBook strTome
+                  [ corpusHeader theWork strTome
                   , (articleFrom source $< constL metadata) ] ]
 
-articleFrom :: FilePath -> Article -> IOSArrow b XmlTree
-articleFrom source article =
+articleFrom :: FilePath -> Row -> IOSArrow b XmlTree
+articleFrom source row =
   selem "TEI"
     [ teiHeader
-        (selem "titleStmt" [title . Text.unpack . headWord.get $ article])
+        (selem "titleStmt" [title . Text.unpack . headword.get $ row])
         (publicationStmt
           [ selem "p" [ txt "Annotated with Stanza by project GEODE" ] ])
         (sourceDesc [ authorArrow >>> selem "author" [ mkText ] ])
-    , text "article" article (loadConllu input) ]
+    , text "article" row (loadConllu input) ]
   where
-    input = source </> relativePath article "conllu"
-    authorArrow = constL (Text.unpack <$> (getList . authors $ get article))
+    input = source </> relativePath (get row :: ArticleRecord) "conllu"
+    authorArrow = constL (Text.unpack <$> (getList . authors $ get row))
 
 loadConllu :: FilePath -> IOSArrow b XmlTree
 loadConllu input =
@@ -89,7 +89,7 @@ misc (CW {_misc}) = _misc >>= (fmap Text.unpack . getNer . Text.pack)
 main :: IO ()
 main = getArgs >>= cli
   where
-    books = groupBy (book.get)
+    works = groupBy (work.get)
     cli [inputMeta, sourceRoot, targetRoot] =
-      try (readNamedTsv inputMeta) >>= mapM_ (sourceRoot `to` targetRoot) . books
+      readTSV inputMeta >>= mapM_ (sourceRoot `to` targetRoot) . works . rows
     cli _ = syntax "METADATA_TSV_FILE SOURCE_DIR TARGET_DIR"
diff --git a/scripts/textometry/topNOUN.hs b/scripts/textometry/top-noun.hs
similarity index 86%
rename from scripts/textometry/topNOUN.hs
rename to scripts/textometry/top-noun.hs
index f2134f9..88bc3ec 100755
--- a/scripts/textometry/topNOUN.hs
+++ b/scripts/textometry/top-noun.hs
@@ -14,14 +14,13 @@ import Data.Maybe (listToMaybe)
 import Data.Serialize (decode)
 import Data.Tree (Tree(..))
 import GEODE.Metadata
-  ( type(@), ArticleRecord, Record(..), WithDefaultHeader(..), glue
-  , readNamedTsv, tsvLines )
-import GEODE.Metadata.TSV.Header (for, getHeader)
+  ( type(@), ArticleRecord, Document(..), ReadTSV(..), Record(..)
+  , WithDefaultHeader(..), WriteTSV(..), for, getHeader, glue )
 import Options.Applicative
   ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc
   , short, strOption )
 import System.FilePath ((</>))
-import System.Script (try, warn)
+import System.Script (warn)
 
 data Config = Config
   { inputRoot :: FilePath
@@ -76,7 +75,7 @@ firstTopNOUN (IndexedSentence {_syntax}) = listToMaybe _syntax >>= fromTop
 main :: IO ()
 main = getConfig >>= runReaderT chain
   where
-    chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay
-    searchAndDisplay rows = do
-      liftIO $ tsvLines [getHeader (for :: Result)]
-      mapM_ (\ar -> profile ar >>= liftIO . tsvLines . maybe [] (:[])) rows
+    chain = asks inputTsv >>= liftIO . readTSV >>= searchAndDisplay
+    searchAndDisplay (Document {rows}) = do
+      liftIO $ writeTSV () [getHeader (for :: Result)]
+      mapM_ (\ar -> profile ar >>= liftIO . writeTSV () . maybe [] (:[])) rows
-- 
GitLab