From 1d6dee078741a91aaa29d11e6cdad61b0c92538d Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Thu, 14 Sep 2023 11:09:56 +0200 Subject: [PATCH] Forgot to prevent pandas from adding the annoying index column --- lib/Data/Metadata/Trie.hs | 15 +++-- lib/GEODE/Metadata/PrimaryKey/Paragraph.hs | 52 --------------- manifest.scm | 20 ++++-- scripts/ML/predict.py | 2 +- scripts/subcorpus/getTXMQuery.hs | 77 ++++++++++++++++++++-- scripts/textometry/scoresByDomain.hs | 26 +++++--- 6 files changed, 114 insertions(+), 78 deletions(-) delete mode 100644 lib/GEODE/Metadata/PrimaryKey/Paragraph.hs diff --git a/lib/Data/Metadata/Trie.hs b/lib/Data/Metadata/Trie.hs index 3ad51a6..7d33586 100644 --- a/lib/Data/Metadata/Trie.hs +++ b/lib/Data/Metadata/Trie.hs @@ -1,6 +1,13 @@ {-# LANGUAGE NamedFieldPuns #-} module Data.Metadata.Trie - () where + --( Edges + --, Indexed + --, Trie(..) + --, Zipper(..) + --, at + --, index + --, trieOf ) where +where import Data.Foldable (toList) import Data.Map (Map) @@ -9,7 +16,7 @@ import qualified Data.Map as Map (delete, empty, insert, lookup, singleton) data Trie e n = Trie { store :: Maybe n - , edges :: Edges e n } + , edges :: Edges e n } deriving Show type Edges e n = Map e (Trie e n) data Zipper e n = @@ -46,14 +53,14 @@ at trie = at_ trie . toList insert :: (Ord e, Foldable t) => t e -> n -> Trie e n -> Trie e n insert path n trie = zipUp (edit (toList path) (trie, Top)) where - edit [] (trie, zipper) = (trie {store = Just n}, zipper) + edit [] (subTrie, zipper) = (subTrie {store = Just n}, zipper) edit (e:es) (Trie {store, edges}, above) = let (subTree, otherEdges) = partition e edges in edit es (subTree, Zip {atValue = store, otherEdges, byEdge = e, above}) partition e edges = case Map.lookup e edges of Just subTree -> (subTree, Map.delete e edges) - _ -> (empty, Map.empty) + _ -> (empty, edges) type Indexed e n = ([e], n) diff --git a/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs b/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs deleted file mode 100644 index e0d906f..0000000 --- a/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs +++ /dev/null @@ -1,52 +0,0 @@ -{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-} -module GEODE.Metadata.PrimaryKey.Paragraph - ( ParagraphPK(..) ) where - -import Data.Aeson ((.=), ToJSON(..), object, pairs) -import Data.Csv - ( (.:), FromNamedRecord(..), ToNamedRecord(..), namedField, namedRecord ) -import GEODE.Metadata.File (File(..)) -import GEODE.Metadata as Article - (DefaultFields(..), HasDefaultHeader(..), PrimaryKey(PrimaryKey, book, tome), uid, relativePath) -import qualified GEODE.Metadata as Article (PrimaryKey(rank)) -import System.FilePath ((</>), (<.>), dropExtension) -import Text.Printf (printf) - -data ParagraphPK = ParagraphPK - { article :: PrimaryKey - , rank :: Int } deriving (Eq, Ord, Show) - -instance FromNamedRecord ParagraphPK where - parseNamedRecord nr = ParagraphPK - <$> (PrimaryKey <$> nr .: "book" <*> nr .: "tome" <*> nr .: "article#") - <*> nr .: "paragraph#" - -instance ToNamedRecord ParagraphPK where - toNamedRecord (ParagraphPK {article, rank}) = namedRecord - [ namedField "book" (book article) - , namedField "tome" (tome article) - , namedField "article#" (Article.rank article) - , namedField "paragraph#" rank ] - -instance File ParagraphPK where - uid (ParagraphPK {article, rank}) = printf "%s_%d" (Article.uid article) rank - - relativePath (ParagraphPK {article, rank}) extension = - articleDirectory </> (show rank) <.> extension - where - articleDirectory = dropExtension (Article.relativePath article "") - -instance HasDefaultHeader ParagraphPK where - defaultFields = DefaultFields (["book", "tome", "article#", "paragraph#"]) - -instance ToJSON ParagraphPK where - toJSON (ParagraphPK {article, rank}) = object - [ "book" .= book article - , "tome" .= tome article - , "article#" .= Article.rank article - , "paragraph#" .= rank ] - toEncoding (ParagraphPK {article, rank}) = pairs - ( "book" .= book article - <> "tome" .= tome article - <> "article#" .= Article.rank article - <> "paragraph#" .= rank ) diff --git a/manifest.scm b/manifest.scm index bf7afd5..6d28e6e 100644 --- a/manifest.scm +++ b/manifest.scm @@ -1,34 +1,44 @@ (use-modules ((geode packages annotation) #:select (python-stanza)) + ;((geode packages encoding) #:select (ghc-geode)) ((geode packages models) #:select (stanza-fr)) + ((gnu packages base) #:select (findutils sed)) ((gnu packages commencement) #:select (gcc-toolchain)) ((gnu packages haskell) #:select (ghc)) ((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt)) - ((gnu packages haskell-xyz) #:select (ghc-cassava ghc-hs-conllu)) + ((gnu packages haskell-xyz) #:select (ghc-cassava + ghc-hs-conllu + ghc-random)) ((gnu packages python) #:select (python)) ((gnu packages python-science) #:select (python-pandas)) ((gnu packages python-xyz) #:select (python-beautifulsoup4)) ((gnu packages xml) #:select (python-lxml))) ;(define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm")) -;(define edda-clinic (load "/home/alice/Logiciel/EDdAClinic/guix.scm")) -;(define ghc-geode (load "/home/alice/Logiciel/ghc-geode/guix.scm")) +(define edda-clinic (load "/home/alice/Logiciel/EDdAClinic/guix.scm")) +(define ghc-geode (load "/home/alice/Logiciel/ghc-geode/guix.scm")) +(define processing-lge (load "/home/alice/Logiciel/ProcessingLGE/guix.scm")) +;(define soprano (load "/home/alice/Logiciel/soprano/guix.scm")) (packages->manifest (list coreutils ; mktemp for atomic processing, strip CSV headers, general scripting - ;edda-clinic ; fix and cut the EDdA + edda-clinic ; fix and cut the EDdA + findutils ; retrieve ALTO pages in files from the BnF gcc-toolchain ; running haskell ghc ; running haskell ghc-aeson ; working with JSON in haskell ghc-cassava ; working with CSV in haskell - ;ghc-geode ; handling corpus files + ghc-geode ; handling corpus files ghc-hs-conllu ; working on syntax-annotated documents ghc-hxt ; working on xml documents + ghc-random ; sampling data at random + processing-lge ; extracting articles from the BnF files python ; scripts python-beautifulsoup4 ; extract EDdA metadata from TEI files ;python-edda ; TODO python-lxml ; fusion articles into tomes for TXM python-pandas ; working with CSV in python python-stanza ; annotation + sed ; select files from listing stanza-fr ; annotation )) diff --git a/scripts/ML/predict.py b/scripts/ML/predict.py index e95b305..974fc69 100644 --- a/scripts/ML/predict.py +++ b/scripts/ML/predict.py @@ -120,4 +120,4 @@ def label(classify, source, tsv_path, name='label'): if __name__ == '__main__': classify = Classifier(argv[1]) source = Source(argv[2]) - label(classify, source, argv[3]).to_csv(argv[4], sep='\t') + label(classify, source, argv[3]).to_csv(argv[4], sep='\t', index=False) diff --git a/scripts/subcorpus/getTXMQuery.hs b/scripts/subcorpus/getTXMQuery.hs index 854bf6a..93a5b77 100755 --- a/scripts/subcorpus/getTXMQuery.hs +++ b/scripts/subcorpus/getTXMQuery.hs @@ -1,17 +1,82 @@ #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" {-# LANGUAGE OverloadedStrings #-} -import Data.Text as Text (Text, pack, intercalate) +import Data.Map (Map, toList) +import Data.Metadata.Trie (Indexed, Trie(..), index) +import Data.Text as Text (Text, concat, cons, pack, intercalate, singleton, snoc) import Data.Text.IO as Text (putStr) -import GEODE.Metadata (PrimaryKey, uid, readNamedTsv) +import GEODE.Metadata (Book, PrimaryKey(..), readNamedTsv) import System.Environment (getArgs) import System.Script (syntax, try) -import Text.Printf (printf) -txmQuery :: Foldable t => t PrimaryKey -> Text -txmQuery = ("/region[text,a]:: " <>) . Text.intercalate "|" . groupBy (book.get) +data MetaPath = Book Book | TomeDigit Char | RankDigit Char | Separator deriving (Eq, Ord, Show) + +data RegExp = + Symbol Char + | Range (Char, Char) + | Union [RegExp] + | Concat [RegExp] + | Optional RegExp + +size :: RegExp -> Int +size (Symbol _) = 1 +size (Range _) = 5 +size (Union l) = length l + 1 + getSum $ foldMap (Sum . size) l +size (Concat l) = length l + 1 + getSum $ foldMap (Sum . size) l + +compile :: RegExp -> Text +compile (Symbol c) = Text.singleton c +compile (Range (a, b)) + | a > b = compile $ Range (b, a) + | otherwise = Text.concat ["[", Text.singleton a, "-", Text.singleton b, "]"] +compile Union rs = + between ('(', ')') . Text.intercalate "|" $ compile <$> rs +compile Concat rs = Text.concat rs +compile Optional c@(Concat _) = compile c <> "?" +compile Optional _ = compile r <> "?" + +pathIndexer :: PrimaryKey -> Indexed MetaPath PrimaryKey +pathIndexer pk@(PrimaryKey {book, tome, rank}) = + ( Book book : Separator : (TomeDigit <$> show tome) + ++ (Separator : (RankDigit <$> show rank)) + , pk ) + +between :: (Char, Char) -> Text -> Text +between (open, close) inside = open `cons` inside `snoc` close + +txmQuery :: (Foldable t, Functor t) => t PrimaryKey -> Text +txmQuery = queryOfTrie . index pathIndexer where - select = Text.pack . printf "a.text_uid=\"%s\"" . uid + queryOfTrie = + ("/region[text,a]:: a.text_uid=" <>) . between ('"', '"') . toRegex + +class Regexable a where + toRegex :: a -> RegExp + +instance Regexable e => Regexable (Trie e a) where + toRegex (Trie {store = Nothing, edges}) = toRegex edges + toRegex trie@(Trie {store = Just _, edges}) + | length edges == 0 = Concat [] + | otherwise = Optional (toRegex (trie {store = Nothing})) + +instance Regexable MetaPath where + toRegex (Book b) = Concat (Symbol <$> show b) + toRegex Separator = Symbol '_' + toRegex (TomeDigit c) = System c + toRegex (RankDigit c) = System c + +instance (Regexable a, Regexable e) => Regexable (Map e a) where + toRegex = toRegex . toList + +instance (Regexable a, Regexable e) => Regexable (e, a) where + toRegex (e, a) = Concat [toRegex e, toRegex a] + +{- +instance (Regexable a) => Regexable [a] where + toRegex [] = "" + toRegex [a] = toRegex a + toRegex l = between ('(', ')') . Text.intercalate "|" $ toRegex <$> l +-} main :: IO () main = getArgs >>= run diff --git a/scripts/textometry/scoresByDomain.hs b/scripts/textometry/scoresByDomain.hs index 226dc01..8c0c79b 100755 --- a/scripts/textometry/scoresByDomain.hs +++ b/scripts/textometry/scoresByDomain.hs @@ -2,18 +2,22 @@ {-# LANGUAGE OverloadedStrings #-} import Data.Text as Text - ( Text, cons, drop, dropEnd, intercalate, isPrefixOf, length, lines, replace - , snoc, splitOn ) + ( Text, drop, dropEnd, intercalate, isPrefixOf, length, lines ) + --( Text, cons, drop, dropEnd, intercalate, isPrefixOf, length, lines, replace + --, snoc, splitOn ) import Data.Text.IO as Text (getLine, interact, putStrLn) import System.Exit (die) +import Text.TSV (fromTsvLine, toTsvLine) data ColumnSelector = ColumnSelector { position :: Int , name :: Text } deriving Show newtype Header = Header [ColumnSelector] deriving Show -getColumns :: Text -> [Text] -getColumns = splitOn "\t" +{- +fromTsvLine :: Text -> [Text] +fromTsvLine = splitOn "\t" +-} getHeader :: [Text] -> IO Header getHeader [] = die "no columns" @@ -29,21 +33,23 @@ getHeader (first:otherColumns) = pure $ | otherwise = tmp score = "score_:" -toTsv :: [Text] -> Text -toTsv [] = "" -toTsv (first:otherColumns) = intercalate "\t" (escape first:otherColumns) +{- +toTsvLine :: [Text] -> Text +toTsvLine [] = "" +toTsvLine (first:otherColumns) = intercalate "\t" (escape first:otherColumns) where escape s = '"' `cons` replace "\"" "\"\"" s `snoc` '"' +-} filterLine :: [Int] -> Text -> Text -filterLine positions = toTsv . extract . getColumns +filterLine positions = toTsvLine . extract . fromTsvLine where extract columns = (columns !!) <$> positions main :: IO () main = do - Header header <- getHeader.getColumns =<< Text.getLine - Text.putStrLn . toTsv $ name <$> header + Header header <- getHeader.fromTsvLine =<< Text.getLine + Text.putStrLn . toTsvLine $ name <$> header Text.interact (textUnlines . map (filterLine $ position <$> header) . Text.lines) where -- GitLab