From 1d6dee078741a91aaa29d11e6cdad61b0c92538d Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Thu, 14 Sep 2023 11:09:56 +0200
Subject: [PATCH] Forgot to prevent pandas from adding the annoying index
 column
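
pandas' DataFrame.to_csv writes the DataFrame index as an extra, unnamed
first column unless index=False is passed, hence the change in
scripts/ML/predict.py:

    label(classify, source, argv[3]).to_csv(argv[4], sep='\t', index=False)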

---
 lib/Data/Metadata/Trie.hs                  | 15 +++--
 lib/GEODE/Metadata/PrimaryKey/Paragraph.hs | 52 ---------------
 manifest.scm                               | 20 ++++--
 scripts/ML/predict.py                      |  2 +-
 scripts/subcorpus/getTXMQuery.hs           | 77 ++++++++++++++++++++--
 scripts/textometry/scoresByDomain.hs       | 26 +++++---
 6 files changed, 114 insertions(+), 78 deletions(-)
 delete mode 100644 lib/GEODE/Metadata/PrimaryKey/Paragraph.hs

diff --git a/lib/Data/Metadata/Trie.hs b/lib/Data/Metadata/Trie.hs
index 3ad51a6..7d33586 100644
--- a/lib/Data/Metadata/Trie.hs
+++ b/lib/Data/Metadata/Trie.hs
@@ -1,6 +1,13 @@
 {-# LANGUAGE NamedFieldPuns #-}
 module Data.Metadata.Trie
-  () where
+  --( Edges
+  --, Indexed
+  --, Trie(..)
+  --, Zipper(..)
+  --, at
+  --, index
+  --, trieOf ) where
+where
 
 import Data.Foldable (toList)
 import Data.Map (Map)
@@ -9,7 +16,7 @@ import qualified Data.Map as Map (delete, empty, insert, lookup, singleton)
 data Trie e n =
   Trie
     { store :: Maybe n
-    , edges :: Edges e n }
+    , edges :: Edges e n } deriving Show
 type Edges e n = Map e (Trie e n)
 
 data Zipper e n =
@@ -46,14 +53,14 @@ at trie = at_ trie . toList
 insert :: (Ord e, Foldable t) => t e -> n -> Trie e n -> Trie e n
 insert path n trie = zipUp (edit (toList path) (trie, Top))
   where
-    edit [] (trie, zipper) = (trie {store = Just n}, zipper)
+    edit [] (subTrie, zipper) = (subTrie {store = Just n}, zipper)
     edit (e:es) (Trie {store, edges}, above) =
       let (subTree, otherEdges) = partition e edges in
       edit es (subTree, Zip {atValue = store, otherEdges, byEdge = e, above})
     partition e edges =
       case Map.lookup e edges of
         Just subTree -> (subTree, Map.delete e edges)
-        _ -> (empty, Map.empty)
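+        -- keep the remaining edges: pairing the new branch with Map.empty
+        -- here dropped every sibling edge of the node being extended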
+        _ -> (empty, edges)
 
 type Indexed e n = ([e], n)
 
diff --git a/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs b/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs
deleted file mode 100644
index e0d906f..0000000
--- a/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs
+++ /dev/null
@@ -1,52 +0,0 @@
-{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
-module GEODE.Metadata.PrimaryKey.Paragraph
-  ( ParagraphPK(..) ) where
-
-import Data.Aeson ((.=), ToJSON(..), object, pairs)
-import Data.Csv
-  ( (.:), FromNamedRecord(..), ToNamedRecord(..), namedField, namedRecord )
-import GEODE.Metadata.File (File(..))
-import GEODE.Metadata as Article
-  (DefaultFields(..), HasDefaultHeader(..), PrimaryKey(PrimaryKey, book, tome), uid, relativePath)
-import qualified GEODE.Metadata as Article (PrimaryKey(rank))
-import System.FilePath ((</>), (<.>), dropExtension)
-import Text.Printf (printf)
-
-data ParagraphPK = ParagraphPK
-  { article :: PrimaryKey
-  , rank :: Int } deriving (Eq, Ord, Show)
-
-instance FromNamedRecord ParagraphPK where
-  parseNamedRecord nr = ParagraphPK
-    <$> (PrimaryKey <$> nr .: "book" <*> nr .: "tome" <*> nr .: "article#")
-    <*> nr .: "paragraph#"
-
-instance ToNamedRecord ParagraphPK where
-  toNamedRecord (ParagraphPK {article, rank}) = namedRecord
-    [ namedField "book" (book article)
-    , namedField "tome" (tome article)
-    , namedField "article#" (Article.rank article)
-    , namedField "paragraph#" rank ]
-
-instance File ParagraphPK where
-  uid (ParagraphPK {article, rank}) = printf "%s_%d" (Article.uid article) rank
-
-  relativePath (ParagraphPK {article, rank}) extension =
-    articleDirectory </> (show rank) <.> extension
-    where
-      articleDirectory = dropExtension (Article.relativePath article "")
-
-instance HasDefaultHeader ParagraphPK where
-  defaultFields = DefaultFields (["book", "tome", "article#", "paragraph#"])
-
-instance ToJSON ParagraphPK where
-  toJSON (ParagraphPK {article, rank}) = object
-    [ "book" .= book article
-    , "tome" .= tome article
-    , "article#" .= Article.rank article
-    , "paragraph#" .= rank ]
-  toEncoding (ParagraphPK {article, rank}) = pairs
-    ( "book" .= book article
-    <> "tome" .= tome article
-    <> "article#" .= Article.rank article
-    <> "paragraph#" .= rank )
diff --git a/manifest.scm b/manifest.scm
index bf7afd5..6d28e6e 100644
--- a/manifest.scm
+++ b/manifest.scm
@@ -1,34 +1,44 @@
 (use-modules ((geode packages annotation) #:select (python-stanza))
+             ;((geode packages encoding) #:select (ghc-geode))
              ((geode packages models) #:select (stanza-fr))
+             ((gnu packages base) #:select (coreutils findutils sed))
              ((gnu packages commencement) #:select (gcc-toolchain))
              ((gnu packages haskell) #:select (ghc))
              ((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt))
-             ((gnu packages haskell-xyz) #:select (ghc-cassava ghc-hs-conllu))
+             ((gnu packages haskell-xyz) #:select (ghc-cassava
+                                                   ghc-hs-conllu
+                                                   ghc-random))
              ((gnu packages python) #:select (python))
              ((gnu packages python-science) #:select (python-pandas))
              ((gnu packages python-xyz) #:select (python-beautifulsoup4))
              ((gnu packages xml) #:select (python-lxml)))
 
 ;(define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm"))
-;(define edda-clinic (load "/home/alice/Logiciel/EDdAClinic/guix.scm"))
-;(define ghc-geode (load "/home/alice/Logiciel/ghc-geode/guix.scm"))
+(define edda-clinic (load "/home/alice/Logiciel/EDdAClinic/guix.scm"))
+(define ghc-geode (load "/home/alice/Logiciel/ghc-geode/guix.scm"))
+(define processing-lge (load "/home/alice/Logiciel/ProcessingLGE/guix.scm"))
+;(define soprano (load "/home/alice/Logiciel/soprano/guix.scm"))
 
 (packages->manifest
   (list
     coreutils ; mktemp for atomic processing, strip CSV headers, general scripting
-    ;edda-clinic ; fix and cut the EDdA
+    edda-clinic ; fix and cut the EDdA
+    findutils ; retrieve ALTO pages in files from the BnF
     gcc-toolchain ; running haskell
     ghc ; running haskell
     ghc-aeson ; working with JSON in haskell
     ghc-cassava ; working with CSV in haskell
-    ;ghc-geode ; handling corpus files
+    ghc-geode ; handling corpus files
     ghc-hs-conllu ; working on syntax-annotated documents
     ghc-hxt ; working on xml documents
+    ghc-random ; sampling data at random
+    processing-lge ; extracting articles from the BnF files
     python ; scripts
     python-beautifulsoup4 ; extract EDdA metadata from TEI files
     ;python-edda ; TODO
     python-lxml ; fusion articles into tomes for TXM
     python-pandas ; working with CSV in python
     python-stanza ; annotation
+    sed ; select files from listing
     stanza-fr ; annotation
     ))
diff --git a/scripts/ML/predict.py b/scripts/ML/predict.py
index e95b305..974fc69 100644
--- a/scripts/ML/predict.py
+++ b/scripts/ML/predict.py
@@ -120,4 +120,4 @@ def label(classify, source, tsv_path, name='label'):
 if __name__ == '__main__':
     classify = Classifier(argv[1])
     source = Source(argv[2])
-    label(classify, source, argv[3]).to_csv(argv[4], sep='\t')
+    label(classify, source, argv[3]).to_csv(argv[4], sep='\t', index=False)
diff --git a/scripts/subcorpus/getTXMQuery.hs b/scripts/subcorpus/getTXMQuery.hs
index 854bf6a..93a5b77 100755
--- a/scripts/subcorpus/getTXMQuery.hs
+++ b/scripts/subcorpus/getTXMQuery.hs
@@ -1,17 +1,82 @@
 #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
-{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
 
-import Data.Text as Text (Text, pack, intercalate)
+import Data.Map (Map, toList)
+import Data.Metadata.Trie (Indexed, Trie(..), index)
+import Data.Text as Text (Text, concat, cons, pack, intercalate, singleton, snoc)
 import Data.Text.IO as Text (putStr)
-import GEODE.Metadata (PrimaryKey, uid, readNamedTsv)
+import GEODE.Metadata (Book, PrimaryKey(..), readNamedTsv)
 import System.Environment (getArgs)
 import System.Script (syntax, try)
-import Text.Printf (printf)
 
-txmQuery :: Foldable t => t PrimaryKey -> Text
-txmQuery = ("/region[text,a]:: " <>) . Text.intercalate "|" . groupBy (book.get)
+data MetaPath = Book Book | TomeDigit Char | RankDigit Char | Separator deriving (Eq, Ord, Show)
+
+data RegExp =
+    Symbol Char
+  | Range (Char, Char)
+  | Union [RegExp]
+  | Concat [RegExp]
+  | Optional RegExp
+
+size :: RegExp -> Int
+size (Symbol _) = 1
+size (Range _) = 5
+size (Union l) = length l + 1 + sum (size <$> l)
+size (Concat l) = length l + 1 + sum (size <$> l)
+size (Optional r) = 1 + size r
+
+compile :: RegExp -> Text
+compile (Symbol c) = Text.singleton c
+compile (Range (a, b))
+  | a > b = compile $ Range (b, a)
+  | otherwise = Text.concat ["[", Text.singleton a, "-", Text.singleton b, "]"]
+compile (Union rs) =
+  between ('(', ')') . Text.intercalate "|" $ compile <$> rs
+compile (Concat rs) = Text.concat (compile <$> rs)
+compile (Optional r@(Symbol _)) = compile r <> "?"
+compile (Optional r) = between ('(', ')') (compile r) <> "?"
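+-- for instance, compile (Concat [Symbol 'a', Optional (Symbol 'b')])
+-- should yield "ab?"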
+
+pathIndexer :: PrimaryKey -> Indexed MetaPath PrimaryKey
+pathIndexer pk@(PrimaryKey {book, tome, rank}) =
+  ( Book book : Separator : (TomeDigit <$> show tome)
+      ++ (Separator : (RankDigit <$> show rank))
+  , pk )
+
+between :: (Char, Char) -> Text -> Text
+between (open, close) inside = open `cons` inside `snoc` close
+
+txmQuery :: (Foldable t, Functor t) => t PrimaryKey -> Text
+txmQuery = queryOfTrie . index pathIndexer
   where
-    select = Text.pack . printf "a.text_uid=\"%s\"" . uid
+    queryOfTrie =
+      ("/region[text,a]:: a.text_uid=" <>) . between ('"', '"') . toRegex
+
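+-- A sketch of the expected output, assuming `index` groups the keys by
+-- shared prefix and that uids follow the <book>_<tome>_<rank> pattern
+-- encoded by pathIndexer: for two articles EDdA_1_1 and EDdA_1_2,
+-- txmQuery should produce something like
+--   /region[text,a]:: a.text_uid="EDdA_1_(1|2)"
+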
+class Regexable a where
+  toRegex :: a -> RegExp
+
+instance Regexable e => Regexable (Trie e a) where
+  toRegex (Trie {store = Nothing, edges}) = toRegex edges
+  toRegex trie@(Trie {store = Just _, edges})
+    | null edges = Concat []
+    | otherwise = Optional (toRegex (trie {store = Nothing}))
+
+instance Regexable MetaPath where
+  toRegex (Book b) = Concat (Symbol <$> show b)
+  toRegex Separator = Symbol '_'
+  toRegex (TomeDigit c) = Symbol c
+  toRegex (RankDigit c) = Symbol c
+
+instance (Regexable a, Regexable e) => Regexable (Map e a) where
+  toRegex = toRegex . toList
+
+instance (Regexable a, Regexable e) => Regexable (e, a) where
+  toRegex (e, a) = Concat [toRegex e, toRegex a]
+
+instance (Regexable a) => Regexable [a] where
+  toRegex [] = Concat []
+  toRegex [x] = toRegex x
+  toRegex l = Union (toRegex <$> l)
 
 main :: IO ()
 main = getArgs >>= run
diff --git a/scripts/textometry/scoresByDomain.hs b/scripts/textometry/scoresByDomain.hs
index 226dc01..8c0c79b 100755
--- a/scripts/textometry/scoresByDomain.hs
+++ b/scripts/textometry/scoresByDomain.hs
@@ -2,18 +2,22 @@
 {-# LANGUAGE OverloadedStrings #-}
 
 import Data.Text as Text
-  ( Text, cons, drop, dropEnd, intercalate, isPrefixOf, length, lines, replace
-  , snoc, splitOn )
+  ( Text, drop, dropEnd, intercalate, isPrefixOf, length, lines )
+  --( Text, cons, drop, dropEnd, intercalate, isPrefixOf, length, lines, replace
+  --, snoc, splitOn )
 import Data.Text.IO as Text (getLine, interact, putStrLn)
 import System.Exit (die)
+import Text.TSV (fromTsvLine, toTsvLine)
 
 data ColumnSelector = ColumnSelector
   { position :: Int
   , name :: Text } deriving Show
 newtype Header = Header [ColumnSelector] deriving Show
 
-getColumns :: Text -> [Text]
-getColumns = splitOn "\t"
+{-
+fromTsvLine :: Text -> [Text]
+fromTsvLine = splitOn "\t"
+-}
 
 getHeader :: [Text] -> IO Header
 getHeader [] = die "no columns"
@@ -29,21 +33,23 @@ getHeader (first:otherColumns) = pure $
       | otherwise = tmp
     score = "score_:"
 
-toTsv :: [Text] -> Text
-toTsv [] = ""
-toTsv (first:otherColumns) = intercalate "\t" (escape first:otherColumns)
+{-
+toTsvLine :: [Text] -> Text
+toTsvLine [] = ""
+toTsvLine (first:otherColumns) = intercalate "\t" (escape first:otherColumns)
   where
     escape s = '"' `cons` replace "\"" "\"\"" s `snoc` '"'
+-}
 
 filterLine :: [Int] -> Text -> Text
-filterLine positions = toTsv . extract . getColumns
+filterLine positions = toTsvLine . extract . fromTsvLine
   where
     extract columns = (columns !!) <$> positions
 
 main :: IO ()
 main = do
-  Header header <- getHeader.getColumns =<< Text.getLine
-  Text.putStrLn . toTsv $ name <$> header
+  Header header <- getHeader.fromTsvLine =<< Text.getLine
+  Text.putStrLn . toTsvLine $ name <$> header
   Text.interact
     (textUnlines . map (filterLine $ position <$> header) . Text.lines)
   where
-- 
GitLab