Skip to content
Snippets Groups Projects
Commit babaccfa authored by Alice Brenon's avatar Alice Brenon
Browse files

Work in progress on script for lexicoscope

parent 73734174
No related branches found
No related tags found
No related merge requests found
...@@ -9,23 +9,25 @@ import Control.Monad.IO.Class (MonadIO(..)) ...@@ -9,23 +9,25 @@ import Control.Monad.IO.Class (MonadIO(..))
import Control.Monad.State (StateT(..), evalStateT) import Control.Monad.State (StateT(..), evalStateT)
import Data.List as List (intercalate) import Data.List as List (intercalate)
--import Data.Map (toList) --import Data.Map (toList)
import GEODE.Metadata (Contrastive(..), Entry(..), Has(..), PrimaryKey(..), type(@), formatList, groupBy, readNamedTsv, relativePath, sortBy, uid) import GEODE.Metadata (Book, Contrastive(..), Entry(..), Has(..), PrimaryKey(..), type(@), formatList, groupBy, readNamedTsv, relativePath, sortBy, uid)
import Data.Text as Text (unpack) import Data.Text as Text (unpack)
import System.Environment (getArgs) import System.Environment (getArgs)
import System.Exit (die) import System.Exit (die)
import System.FilePath ((</>), (<.>)) import System.FilePath ((</>), (<.>))
import System.Script (syntax) import System.Script (syntax, try)
import Text.XML.HXT.Core import Text.XML.HXT.Core
((>>>), ($<), ArrowXml, IOSLA(..), IOStateArrow, XmlTree, attr, mkelem, selem ((>>>), (|||), ($<), ArrowXml, IOSLA(..), IOStateArrow, XmlTree, arrIO, arrIO0, arrL, attr, constL, mkelem, selem
, txt, writeDocument) , txt, withIndent, writeDocument)
import Text.XML.HXT.Arrow.XmlState.TypeDefs (Selector(..), chgS, theUserState) import Text.XML.HXT.Arrow.XmlState.TypeDefs (Selector(..), chgS, theUserState)
import Text.XML.HXT.Arrow.XmlState.RunIOStateArrow (initialState) import Text.XML.HXT.Arrow.XmlState.RunIOStateArrow (initialState)
type Article = PrimaryKey @ Entry @ Contrastive type Article = PrimaryKey @ Entry @ Contrastive
type Serial = (Int, Int) type Serial = (Int, Int)
type WithSerial a = StateT Serial IO a
type SerialArrow a = IOStateArrow Serial () a
next :: Selector Serial Int -> IOStateArrow Serial a String next :: Selector Serial Int -> SerialArrow String
next intSelector = IOSLA getUpdate next intSelector = IOSLA getUpdate
where where
getUpdate state _ = pure (chgS field (+1) state, [show $ getS field state]) getUpdate state _ = pure (chgS field (+1) state, [show $ getS field state])
...@@ -37,10 +39,53 @@ pId = S {getS = fst, setS = \p (_, s) -> (p, s)} ...@@ -37,10 +39,53 @@ pId = S {getS = fst, setS = \p (_, s) -> (p, s)}
sId :: Selector Serial Int sId :: Selector Serial Int
sId = S {getS = snd, setS = \s (p, _) -> (p, s)} sId = S {getS = snd, setS = \s (p, _) -> (p, s)}
{-
to :: FilePath -> FilePath -> (Int, [Article]) -> StateT Serial IO () to :: FilePath -> FilePath -> (Int, [Article]) -> StateT Serial IO ()
to source target (_, tome) = to source target (_, tome) =
mapM_ (processArticle source target) tome mapM_ (processArticle source target) tome
-}
stateOfArrow :: IOStateArrow s () a -> StateT s IO ()
stateOfArrow a = StateT $ \s0 -> do
(s1, _) <- runIOSLA a (initialState s0) ()
pure ((), getS theUserState s1)
to :: FilePath -> FilePath -> (Book, [Article]) -> WithSerial ()
to source target (theBook, metadata) =
mapM_ runArrow . groupBy (tome.get) $ sortBy (rank.get) metadata
where
runArrow (tomeNumber, tomeArticles) = stateOfArrow $
compileTome source target (theBook, tomeNumber, tomeArticles)
compileTome ::
FilePath -> FilePath -> (Book, Int, [Article]) -> SerialArrow XmlTree
compileTome source target (theBook, tomeNumber, metadata) =
tomeXml >>> writeDocument [withIndent True] output
where
strTome = show tomeNumber
output = target </> (show theBook) <> "_T" <> strTome <.> ".fr.xml"
tomeXml = selem "/"
[ selem "corpus"
[ articleFrom source $< constL metadata ] ]
articleFrom :: FilePath -> Article -> SerialArrow XmlTree
articleFrom source article =
selem "doc"
[ selem "meta" [metaFrom article]
, selem "text" [loadConllu input] ]
--(formatParagraph <$> byParagraph dom) ]
where
input = source </> relativePath article "conllu"
loadConllu :: FilePath -> SerialArrow XmlTree
loadConllu input =
arrIO0 (parseConllu input <$> readFile input) >>> (arrIO debug ||| format)
where
debug msg = die $ "In file " <> input <> "\n" <> msg
format = formatParagraph $< arrL byParagraph
{-
processArticle :: FilePath -> FilePath -> Article -> StateT Serial IO () processArticle :: FilePath -> FilePath -> Article -> StateT Serial IO ()
processArticle source target article = processArticle source target article =
--processArticle source target a@(Article {rank}) = --processArticle source target a@(Article {rank}) =
...@@ -63,10 +108,10 @@ format article outputPath dom = StateT $ \s -> do ...@@ -63,10 +108,10 @@ format article outputPath dom = StateT $ \s -> do
[selem "doc" [selem "doc"
[selem "meta" [metaFrom article] [selem "meta" [metaFrom article]
,selem "text" (formatParagraph <$> byParagraph dom)]]] ,selem "text" (formatParagraph <$> byParagraph dom)]]]
-}
metaFrom :: ArrowXml a => Article -> a n XmlTree metaFrom :: ArrowXml a => Article -> a n XmlTree
metaFrom article = txt $ metaFrom article = txt $
--metaFrom (Article {uid, tome, rank, headWord, authors, domains}) = txt $
concatMap (++ "\n") concatMap (++ "\n")
(List.intercalate "\t" <$> (List.intercalate "\t" <$>
[[] [[]
...@@ -75,13 +120,15 @@ metaFrom article = txt $ ...@@ -75,13 +120,15 @@ metaFrom article = txt $
,["rank", show . rank $ get article] ,["rank", show . rank $ get article]
,["head", Text.unpack . headWord $ get article] ,["head", Text.unpack . headWord $ get article]
,["author", Text.unpack . formatList . authors $ get article] ,["author", Text.unpack . formatList . authors $ get article]
,["domain", Text.unpack . formatList . domains $ get article]]) ,["domains", Text.unpack . formatList . domains $ get article]
,["parallel", show . ("parallel" `elem`) . subCorpus $ get article ]]
,["hydronym", show . ("hydronym" `elem`) . subCorpus $ get article ]])
formatParagraph :: Paragraph -> IOStateArrow Serial n XmlTree formatParagraph :: Paragraph -> SerialArrow XmlTree
formatParagraph (Paragraph sents) = formatParagraph (Paragraph sents) =
mkelem "p" [attr "id" (txt . ('p':) $< next pId)] (formatSentence <$> sents) mkelem "p" [attr "id" (txt . ('p':) $< next pId)] (formatSentence <$> sents)
formatSentence :: Sent -> IOStateArrow Serial n XmlTree formatSentence :: Sent -> SerialArrow XmlTree
formatSentence s = formatSentence s =
mkelem "s" [attr "id" (txt . ('s':) $< next sId)] [txt sentences] mkelem "s" [attr "id" (txt . ('s':) $< next sId)] [txt sentences]
where where
...@@ -91,8 +138,8 @@ main :: IO () ...@@ -91,8 +138,8 @@ main :: IO ()
main = getArgs >>= cli main = getArgs >>= cli
where where
withSerial = (`evalStateT` (0, 0)) withSerial = (`evalStateT` (0, 0))
byTome = groupBy (tome.get) . sortBy uid books = groupBy (book.get)
cli [sourceMeta, sourceRoot, targetRoot] = cli [sourceMeta, sourceRoot, targetRoot] =
readNamedTsv sourceMeta try (readNamedTsv sourceMeta)
>>= either die (withSerial . mapM_ (sourceRoot `to` targetRoot) . byTome) >>= (withSerial . mapM_ (sourceRoot `to` targetRoot) . books)
cli _ = syntax "METADATA_CSV_FILE SOURCE_DIR TARGET_DIR" cli _ = syntax "METADATA_CSV_FILE SOURCE_DIR TARGET_DIR"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment