Forgot to prevent pandas from adding the annoying index column

1d6dee07 · Alice Brenon · a12c79c3 · 1d6dee07 · a12c79c3 · 1d6dee07
Commit 1d6dee07 authored Sep 14, 2023 by Alice Brenon
--- a/lib/Data/Metadata/Trie.hs
+++ b/lib/Data/Metadata/Trie.hs
 {-# LANGUAGE NamedFieldPuns #-}
 module Data.Metadata.Trie
-  () where
+  --( Edges
+  --, Indexed
+  --, Trie(..)
+  --, Zipper(..)
+  --, at
+  --, index
+  --, trieOf ) where
+where

 import Data.Foldable (toList)
 import Data.Map (Map)
@@ -9,7 +16,7 @@ import qualified Data.Map as Map (delete, empty, insert, lookup, singleton)
 data Trie e n =
  Trie
    { store :: Maybe n
-    , edges :: Edges e n }
+    , edges :: Edges e n } deriving Show
 type Edges e n = Map e (Trie e n)

 data Zipper e n =
@@ -46,14 +53,14 @@ at trie = at_ trie . toList
 insert :: (Ord e, Foldable t) => t e -> n -> Trie e n -> Trie e n
 insert path n trie = zipUp (edit (toList path) (trie, Top))
  where
-    edit [] (trie, zipper) = (trie {store = Just n}, zipper)
+    edit [] (subTrie, zipper) = (subTrie {store = Just n}, zipper)
    edit (e:es) (Trie {store, edges}, above) =
      let (subTree, otherEdges) = partition e edges in
      edit es (subTree, Zip {atValue = store, otherEdges, byEdge = e, above})
    partition e edges =
      case Map.lookup e edges of
        Just subTree -> (subTree, Map.delete e edges)
-        _ -> (empty, Map.empty)
+        _ -> (empty, edges)

 type Indexed e n = ([e], n)


--- a/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs
+++ b/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs
-{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
-module GEODE.Metadata.PrimaryKey.Paragraph
-  ( ParagraphPK(..) ) where
-
-import Data.Aeson ((.=), ToJSON(..), object, pairs)
-import Data.Csv
-  ( (.:), FromNamedRecord(..), ToNamedRecord(..), namedField, namedRecord )
-import GEODE.Metadata.File (File(..))
-import GEODE.Metadata as Article
-  (DefaultFields(..), HasDefaultHeader(..), PrimaryKey(PrimaryKey, book, tome), uid, relativePath)
-import qualified GEODE.Metadata as Article (PrimaryKey(rank))
-import System.FilePath ((</>), (<.>), dropExtension)
-import Text.Printf (printf)
-
-data ParagraphPK = ParagraphPK
-  { article :: PrimaryKey
-  , rank :: Int } deriving (Eq, Ord, Show)
-
-instance FromNamedRecord ParagraphPK where
-  parseNamedRecord nr = ParagraphPK
-    <$> (PrimaryKey <$> nr .: "book" <*> nr .: "tome" <*> nr .: "article#")
-    <*> nr .: "paragraph#"
-
-instance ToNamedRecord ParagraphPK where
-  toNamedRecord (ParagraphPK {article, rank}) = namedRecord
-    [ namedField "book" (book article)
-    , namedField "tome" (tome article)
-    , namedField "article#" (Article.rank article)
-    , namedField "paragraph#" rank ]
-
-instance File ParagraphPK where
-  uid (ParagraphPK {article, rank}) = printf "%s_%d" (Article.uid article) rank
-
-  relativePath (ParagraphPK {article, rank}) extension =
-    articleDirectory </> (show rank) <.> extension
-    where
-      articleDirectory = dropExtension (Article.relativePath article "")
-
-instance HasDefaultHeader ParagraphPK where
-  defaultFields = DefaultFields (["book", "tome", "article#", "paragraph#"])
-
-instance ToJSON ParagraphPK where
-  toJSON (ParagraphPK {article, rank}) = object
-    [ "book" .= book article
-    , "tome" .= tome article
-    , "article#" .= Article.rank article
-    , "paragraph#" .= rank ]
-  toEncoding (ParagraphPK {article, rank}) = pairs
-    ( "book" .= book article
-    <> "tome" .= tome article
-    <> "article#" .= Article.rank article
-    <> "paragraph#" .= rank )
--- a/manifest.scm
+++ b/manifest.scm
 (use-modules ((geode packages annotation) #:select (python-stanza))
+             ;((geode packages encoding) #:select (ghc-geode))
             ((geode packages models) #:select (stanza-fr))
+             ((gnu packages base) #:select (findutils sed))
             ((gnu packages commencement) #:select (gcc-toolchain))
             ((gnu packages haskell) #:select (ghc))
             ((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt))
-             ((gnu packages haskell-xyz) #:select (ghc-cassava ghc-hs-conllu))
+             ((gnu packages haskell-xyz) #:select (ghc-cassava
+                                                   ghc-hs-conllu
+                                                   ghc-random))
             ((gnu packages python) #:select (python))
             ((gnu packages python-science) #:select (python-pandas))
             ((gnu packages python-xyz) #:select (python-beautifulsoup4))
             ((gnu packages xml) #:select (python-lxml)))

 ;(define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm"))
-;(define edda-clinic (load "/home/alice/Logiciel/EDdAClinic/guix.scm"))
-;(define ghc-geode (load "/home/alice/Logiciel/ghc-geode/guix.scm"))
+(define edda-clinic (load "/home/alice/Logiciel/EDdAClinic/guix.scm"))
+(define ghc-geode (load "/home/alice/Logiciel/ghc-geode/guix.scm"))
+(define processing-lge (load "/home/alice/Logiciel/ProcessingLGE/guix.scm"))
+;(define soprano (load "/home/alice/Logiciel/soprano/guix.scm"))

 (packages->manifest
  (list
    coreutils ; mktemp for atomic processing, strip CSV headers, general scripting
-    ;edda-clinic ; fix and cut the EDdA
+    edda-clinic ; fix and cut the EDdA
+    findutils ; retrieve ALTO pages in files from the BnF
    gcc-toolchain ; running haskell
    ghc ; running haskell
    ghc-aeson ; working with JSON in haskell
    ghc-cassava ; working with CSV in haskell
-    ;ghc-geode ; handling corpus files
+    ghc-geode ; handling corpus files
    ghc-hs-conllu ; working on syntax-annotated documents
    ghc-hxt ; working on xml documents
+    ghc-random ; sampling data at random
+    processing-lge ; extracting articles from the BnF files
    python ; scripts
    python-beautifulsoup4 ; extract EDdA metadata from TEI files
    ;python-edda ; TODO
    python-lxml ; fusion articles into tomes for TXM
    python-pandas ; working with CSV in python
    python-stanza ; annotation
+    sed ; select files from listing
    stanza-fr ; annotation
    ))
--- a/scripts/ML/predict.py
+++ b/scripts/ML/predict.py
@@ -120,4 +120,4 @@ def label(classify, source, tsv_path, name='label'):
 if __name__ == '__main__':
    classify = Classifier(argv[1])
    source = Source(argv[2])
-    label(classify, source, argv[3]).to_csv(argv[4], sep='\t')
+    label(classify, source, argv[3]).to_csv(argv[4], sep='\t', index=False)
--- a/scripts/subcorpus/getTXMQuery.hs
+++ b/scripts/subcorpus/getTXMQuery.hs
 #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
 {-# LANGUAGE OverloadedStrings #-}

-import Data.Text as Text (Text, pack, intercalate)
+import Data.Map (Map, toList)
+import Data.Metadata.Trie (Indexed, Trie(..), index)
+import Data.Text as Text (Text, concat, cons, pack, intercalate, singleton, snoc)
 import Data.Text.IO as Text (putStr)
-import GEODE.Metadata (PrimaryKey, uid, readNamedTsv)
+import GEODE.Metadata (Book, PrimaryKey(..), readNamedTsv)
 import System.Environment (getArgs)
 import System.Script (syntax, try)
-import Text.Printf (printf)

-txmQuery :: Foldable t => t PrimaryKey -> Text
-txmQuery = ("/region[text,a]:: " <>) . Text.intercalate "|" . groupBy (book.get)
+data MetaPath = Book Book | TomeDigit Char | RankDigit Char | Separator deriving (Eq, Ord, Show)
+
+data RegExp =
+    Symbol Char
+  | Range (Char, Char)
+  | Union [RegExp]
+  | Concat [RegExp]
+  | Optional RegExp
+
+size :: RegExp -> Int
+size (Symbol _) = 1
+size (Range _) = 5
+size (Union l) = length l + 1 + getSum $ foldMap (Sum . size) l
+size (Concat l) = length l + 1 + getSum $ foldMap (Sum . size) l
+
+compile :: RegExp -> Text
+compile (Symbol c) = Text.singleton c
+compile (Range (a, b))
+  | a > b = compile $ Range (b, a)
+  | otherwise = Text.concat ["[", Text.singleton a, "-", Text.singleton b, "]"]
+compile Union rs =
+  between ('(', ')') . Text.intercalate "|" $ compile <$> rs
+compile Concat rs = Text.concat rs
+compile Optional c@(Concat _) = compile c <> "?"
+compile Optional _ = compile r <> "?"
+
+pathIndexer :: PrimaryKey -> Indexed MetaPath PrimaryKey
+pathIndexer pk@(PrimaryKey {book, tome, rank}) =
+  ( Book book : Separator : (TomeDigit <$> show tome)
+      ++ (Separator : (RankDigit <$> show rank))
+  , pk )
+
+between :: (Char, Char) -> Text -> Text
+between (open, close) inside = open `cons` inside `snoc` close
+
+txmQuery :: (Foldable t, Functor t) => t PrimaryKey -> Text
+txmQuery = queryOfTrie . index pathIndexer
  where
-    select = Text.pack . printf "a.text_uid=\"%s\"" . uid
+    queryOfTrie =
+      ("/region[text,a]:: a.text_uid=" <>) . between ('"', '"') . toRegex
+
+class Regexable a where
+  toRegex :: a -> RegExp
+
+instance Regexable e => Regexable (Trie e a) where
+  toRegex (Trie {store = Nothing, edges}) = toRegex edges
+  toRegex trie@(Trie {store = Just _, edges})
+    | length edges == 0 = Concat []
+    | otherwise = Optional (toRegex (trie {store = Nothing}))
+
+instance Regexable MetaPath where
+  toRegex (Book b) = Concat (Symbol <$> show b)
+  toRegex Separator = Symbol '_'
+  toRegex (TomeDigit c) = System c
+  toRegex (RankDigit c) = System c
+
+instance (Regexable a, Regexable e) => Regexable (Map e a) where
+  toRegex = toRegex . toList
+
+instance (Regexable a, Regexable e) => Regexable (e, a) where
+  toRegex (e, a) = Concat [toRegex e, toRegex a]
+
+{-
+instance (Regexable a) => Regexable [a] where
+  toRegex [] = ""
+  toRegex [a] = toRegex a
+  toRegex l = between ('(', ')') . Text.intercalate "|" $ toRegex <$> l
+-}

 main :: IO ()
 main = getArgs >>= run

--- a/scripts/textometry/scoresByDomain.hs
+++ b/scripts/textometry/scoresByDomain.hs
@@ -2,18 +2,22 @@
 {-# LANGUAGE OverloadedStrings #-}

 import Data.Text as Text
-  ( Text, cons, drop, dropEnd, intercalate, isPrefixOf, length, lines, replace
-  , snoc, splitOn )
+  ( Text, drop, dropEnd, intercalate, isPrefixOf, length, lines )
+  --( Text, cons, drop, dropEnd, intercalate, isPrefixOf, length, lines, replace
+  --, snoc, splitOn )
 import Data.Text.IO as Text (getLine, interact, putStrLn)
 import System.Exit (die)
+import Text.TSV (fromTsvLine, toTsvLine)

 data ColumnSelector = ColumnSelector
  { position :: Int
  , name :: Text } deriving Show
 newtype Header = Header [ColumnSelector] deriving Show

-getColumns :: Text -> [Text]
-getColumns = splitOn "\t"
+{-
+fromTsvLine :: Text -> [Text]
+fromTsvLine = splitOn "\t"
+-}

 getHeader :: [Text] -> IO Header
 getHeader [] = die "no columns"
@@ -29,21 +33,23 @@ getHeader (first:otherColumns) = pure $
      | otherwise = tmp
    score = "score_:"

-toTsv :: [Text] -> Text
-toTsv [] = ""
-toTsv (first:otherColumns) = intercalate "\t" (escape first:otherColumns)
+{-
+toTsvLine :: [Text] -> Text
+toTsvLine [] = ""
+toTsvLine (first:otherColumns) = intercalate "\t" (escape first:otherColumns)
  where
    escape s = '"' `cons` replace "\"" "\"\"" s `snoc` '"'
+-}

 filterLine :: [Int] -> Text -> Text
-filterLine positions = toTsv . extract . getColumns
+filterLine positions = toTsvLine . extract . fromTsvLine
  where
    extract columns = (columns !!) <$> positions

 main :: IO ()
 main = do
-  Header header <- getHeader.getColumns =<< Text.getLine
-  Text.putStrLn . toTsv $ name <$> header
+  Header header <- getHeader.fromTsvLine =<< Text.getLine
+  Text.putStrLn . toTsvLine $ name <$> header
  Text.interact
    (textUnlines . map (filterLine $ position <$> header) . Text.lines)
  where