Skip to content
Snippets Groups Projects
Commit 1d6dee07 authored by Alice Brenon's avatar Alice Brenon
Browse files

Forgot to prevent pandas from adding the annoying index column

parent a12c79c3
Branches
No related tags found
No related merge requests found
{-# LANGUAGE NamedFieldPuns #-}
module Data.Metadata.Trie
() where
--( Edges
--, Indexed
--, Trie(..)
--, Zipper(..)
--, at
--, index
--, trieOf ) where
where
import Data.Foldable (toList)
import Data.Map (Map)
......@@ -9,7 +16,7 @@ import qualified Data.Map as Map (delete, empty, insert, lookup, singleton)
data Trie e n =
Trie
{ store :: Maybe n
, edges :: Edges e n }
, edges :: Edges e n } deriving Show
type Edges e n = Map e (Trie e n)
data Zipper e n =
......@@ -46,14 +53,14 @@ at trie = at_ trie . toList
insert :: (Ord e, Foldable t) => t e -> n -> Trie e n -> Trie e n
insert path n trie = zipUp (edit (toList path) (trie, Top))
where
edit [] (trie, zipper) = (trie {store = Just n}, zipper)
edit [] (subTrie, zipper) = (subTrie {store = Just n}, zipper)
edit (e:es) (Trie {store, edges}, above) =
let (subTree, otherEdges) = partition e edges in
edit es (subTree, Zip {atValue = store, otherEdges, byEdge = e, above})
partition e edges =
case Map.lookup e edges of
Just subTree -> (subTree, Map.delete e edges)
_ -> (empty, Map.empty)
_ -> (empty, edges)
type Indexed e n = ([e], n)
......
{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
module GEODE.Metadata.PrimaryKey.Paragraph
( ParagraphPK(..) ) where
import Data.Aeson ((.=), ToJSON(..), object, pairs)
import Data.Csv
( (.:), FromNamedRecord(..), ToNamedRecord(..), namedField, namedRecord )
import GEODE.Metadata.File (File(..))
import GEODE.Metadata as Article
(DefaultFields(..), HasDefaultHeader(..), PrimaryKey(PrimaryKey, book, tome), uid, relativePath)
import qualified GEODE.Metadata as Article (PrimaryKey(rank))
import System.FilePath ((</>), (<.>), dropExtension)
import Text.Printf (printf)
data ParagraphPK = ParagraphPK
{ article :: PrimaryKey
, rank :: Int } deriving (Eq, Ord, Show)
instance FromNamedRecord ParagraphPK where
parseNamedRecord nr = ParagraphPK
<$> (PrimaryKey <$> nr .: "book" <*> nr .: "tome" <*> nr .: "article#")
<*> nr .: "paragraph#"
instance ToNamedRecord ParagraphPK where
toNamedRecord (ParagraphPK {article, rank}) = namedRecord
[ namedField "book" (book article)
, namedField "tome" (tome article)
, namedField "article#" (Article.rank article)
, namedField "paragraph#" rank ]
instance File ParagraphPK where
uid (ParagraphPK {article, rank}) = printf "%s_%d" (Article.uid article) rank
relativePath (ParagraphPK {article, rank}) extension =
articleDirectory </> (show rank) <.> extension
where
articleDirectory = dropExtension (Article.relativePath article "")
instance HasDefaultHeader ParagraphPK where
defaultFields = DefaultFields (["book", "tome", "article#", "paragraph#"])
instance ToJSON ParagraphPK where
toJSON (ParagraphPK {article, rank}) = object
[ "book" .= book article
, "tome" .= tome article
, "article#" .= Article.rank article
, "paragraph#" .= rank ]
toEncoding (ParagraphPK {article, rank}) = pairs
( "book" .= book article
<> "tome" .= tome article
<> "article#" .= Article.rank article
<> "paragraph#" .= rank )
(use-modules ((geode packages annotation) #:select (python-stanza))
;((geode packages encoding) #:select (ghc-geode))
((geode packages models) #:select (stanza-fr))
((gnu packages base) #:select (findutils sed))
((gnu packages commencement) #:select (gcc-toolchain))
((gnu packages haskell) #:select (ghc))
((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt))
((gnu packages haskell-xyz) #:select (ghc-cassava ghc-hs-conllu))
((gnu packages haskell-xyz) #:select (ghc-cassava
ghc-hs-conllu
ghc-random))
((gnu packages python) #:select (python))
((gnu packages python-science) #:select (python-pandas))
((gnu packages python-xyz) #:select (python-beautifulsoup4))
((gnu packages xml) #:select (python-lxml)))
;(define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm"))
;(define edda-clinic (load "/home/alice/Logiciel/EDdAClinic/guix.scm"))
;(define ghc-geode (load "/home/alice/Logiciel/ghc-geode/guix.scm"))
(define edda-clinic (load "/home/alice/Logiciel/EDdAClinic/guix.scm"))
(define ghc-geode (load "/home/alice/Logiciel/ghc-geode/guix.scm"))
(define processing-lge (load "/home/alice/Logiciel/ProcessingLGE/guix.scm"))
;(define soprano (load "/home/alice/Logiciel/soprano/guix.scm"))
(packages->manifest
(list
coreutils ; mktemp for atomic processing, strip CSV headers, general scripting
;edda-clinic ; fix and cut the EDdA
edda-clinic ; fix and cut the EDdA
findutils ; retrieve ALTO pages in files from the BnF
gcc-toolchain ; running haskell
ghc ; running haskell
ghc-aeson ; working with JSON in haskell
ghc-cassava ; working with CSV in haskell
;ghc-geode ; handling corpus files
ghc-geode ; handling corpus files
ghc-hs-conllu ; working on syntax-annotated documents
ghc-hxt ; working on xml documents
ghc-random ; sampling data at random
processing-lge ; extracting articles from the BnF files
python ; scripts
python-beautifulsoup4 ; extract EDdA metadata from TEI files
;python-edda ; TODO
python-lxml ; fusion articles into tomes for TXM
python-pandas ; working with CSV in python
python-stanza ; annotation
sed ; select files from listing
stanza-fr ; annotation
))
......@@ -120,4 +120,4 @@ def label(classify, source, tsv_path, name='label'):
if __name__ == '__main__':
classify = Classifier(argv[1])
source = Source(argv[2])
label(classify, source, argv[3]).to_csv(argv[4], sep='\t')
label(classify, source, argv[3]).to_csv(argv[4], sep='\t', index=False)
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
{-# LANGUAGE OverloadedStrings #-}
import Data.Text as Text (Text, pack, intercalate)
import Data.Map (Map, toList)
import Data.Metadata.Trie (Indexed, Trie(..), index)
import Data.Text as Text (Text, concat, cons, pack, intercalate, singleton, snoc)
import Data.Text.IO as Text (putStr)
import GEODE.Metadata (PrimaryKey, uid, readNamedTsv)
import GEODE.Metadata (Book, PrimaryKey(..), readNamedTsv)
import System.Environment (getArgs)
import System.Script (syntax, try)
import Text.Printf (printf)
txmQuery :: Foldable t => t PrimaryKey -> Text
txmQuery = ("/region[text,a]:: " <>) . Text.intercalate "|" . groupBy (book.get)
data MetaPath = Book Book | TomeDigit Char | RankDigit Char | Separator deriving (Eq, Ord, Show)
data RegExp =
Symbol Char
| Range (Char, Char)
| Union [RegExp]
| Concat [RegExp]
| Optional RegExp
size :: RegExp -> Int
size (Symbol _) = 1
size (Range _) = 5
size (Union l) = length l + 1 + getSum $ foldMap (Sum . size) l
size (Concat l) = length l + 1 + getSum $ foldMap (Sum . size) l
compile :: RegExp -> Text
compile (Symbol c) = Text.singleton c
compile (Range (a, b))
| a > b = compile $ Range (b, a)
| otherwise = Text.concat ["[", Text.singleton a, "-", Text.singleton b, "]"]
compile Union rs =
between ('(', ')') . Text.intercalate "|" $ compile <$> rs
compile Concat rs = Text.concat rs
compile Optional c@(Concat _) = compile c <> "?"
compile Optional _ = compile r <> "?"
pathIndexer :: PrimaryKey -> Indexed MetaPath PrimaryKey
pathIndexer pk@(PrimaryKey {book, tome, rank}) =
( Book book : Separator : (TomeDigit <$> show tome)
++ (Separator : (RankDigit <$> show rank))
, pk )
between :: (Char, Char) -> Text -> Text
between (open, close) inside = open `cons` inside `snoc` close
txmQuery :: (Foldable t, Functor t) => t PrimaryKey -> Text
txmQuery = queryOfTrie . index pathIndexer
where
select = Text.pack . printf "a.text_uid=\"%s\"" . uid
queryOfTrie =
("/region[text,a]:: a.text_uid=" <>) . between ('"', '"') . toRegex
class Regexable a where
toRegex :: a -> RegExp
instance Regexable e => Regexable (Trie e a) where
toRegex (Trie {store = Nothing, edges}) = toRegex edges
toRegex trie@(Trie {store = Just _, edges})
| length edges == 0 = Concat []
| otherwise = Optional (toRegex (trie {store = Nothing}))
instance Regexable MetaPath where
toRegex (Book b) = Concat (Symbol <$> show b)
toRegex Separator = Symbol '_'
toRegex (TomeDigit c) = System c
toRegex (RankDigit c) = System c
instance (Regexable a, Regexable e) => Regexable (Map e a) where
toRegex = toRegex . toList
instance (Regexable a, Regexable e) => Regexable (e, a) where
toRegex (e, a) = Concat [toRegex e, toRegex a]
{-
instance (Regexable a) => Regexable [a] where
toRegex [] = ""
toRegex [a] = toRegex a
toRegex l = between ('(', ')') . Text.intercalate "|" $ toRegex <$> l
-}
main :: IO ()
main = getArgs >>= run
......
......@@ -2,18 +2,22 @@
{-# LANGUAGE OverloadedStrings #-}
import Data.Text as Text
( Text, cons, drop, dropEnd, intercalate, isPrefixOf, length, lines, replace
, snoc, splitOn )
( Text, drop, dropEnd, intercalate, isPrefixOf, length, lines )
--( Text, cons, drop, dropEnd, intercalate, isPrefixOf, length, lines, replace
--, snoc, splitOn )
import Data.Text.IO as Text (getLine, interact, putStrLn)
import System.Exit (die)
import Text.TSV (fromTsvLine, toTsvLine)
data ColumnSelector = ColumnSelector
{ position :: Int
, name :: Text } deriving Show
newtype Header = Header [ColumnSelector] deriving Show
getColumns :: Text -> [Text]
getColumns = splitOn "\t"
{-
fromTsvLine :: Text -> [Text]
fromTsvLine = splitOn "\t"
-}
getHeader :: [Text] -> IO Header
getHeader [] = die "no columns"
......@@ -29,21 +33,23 @@ getHeader (first:otherColumns) = pure $
| otherwise = tmp
score = "score_:"
toTsv :: [Text] -> Text
toTsv [] = ""
toTsv (first:otherColumns) = intercalate "\t" (escape first:otherColumns)
{-
toTsvLine :: [Text] -> Text
toTsvLine [] = ""
toTsvLine (first:otherColumns) = intercalate "\t" (escape first:otherColumns)
where
escape s = '"' `cons` replace "\"" "\"\"" s `snoc` '"'
-}
filterLine :: [Int] -> Text -> Text
filterLine positions = toTsv . extract . getColumns
filterLine positions = toTsvLine . extract . fromTsvLine
where
extract columns = (columns !!) <$> positions
main :: IO ()
main = do
Header header <- getHeader.getColumns =<< Text.getLine
Text.putStrLn . toTsv $ name <$> header
Header header <- getHeader.fromTsvLine =<< Text.getLine
Text.putStrLn . toTsvLine $ name <$> header
Text.interact
(textUnlines . map (filterLine $ position <$> header) . Text.lines)
where
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment