Skip to content
Snippets Groups Projects
Commit babaccfa authored by Alice Brenon's avatar Alice Brenon
Browse files

Work in progress on script for lexicoscope

parent 73734174
No related branches found
No related tags found
No related merge requests found
......@@ -9,23 +9,25 @@ import Control.Monad.IO.Class (MonadIO(..))
import Control.Monad.State (StateT(..), evalStateT)
import Data.List as List (intercalate)
--import Data.Map (toList)
import GEODE.Metadata (Contrastive(..), Entry(..), Has(..), PrimaryKey(..), type(@), formatList, groupBy, readNamedTsv, relativePath, sortBy, uid)
import GEODE.Metadata (Book, Contrastive(..), Entry(..), Has(..), PrimaryKey(..), type(@), formatList, groupBy, readNamedTsv, relativePath, sortBy, uid)
import Data.Text as Text (unpack)
import System.Environment (getArgs)
import System.Exit (die)
import System.FilePath ((</>), (<.>))
import System.Script (syntax)
import System.Script (syntax, try)
import Text.XML.HXT.Core
((>>>), ($<), ArrowXml, IOSLA(..), IOStateArrow, XmlTree, attr, mkelem, selem
, txt, writeDocument)
((>>>), (|||), ($<), ArrowXml, IOSLA(..), IOStateArrow, XmlTree, arrIO, arrIO0, arrL, attr, constL, mkelem, selem
, txt, withIndent, writeDocument)
import Text.XML.HXT.Arrow.XmlState.TypeDefs (Selector(..), chgS, theUserState)
import Text.XML.HXT.Arrow.XmlState.RunIOStateArrow (initialState)
type Article = PrimaryKey @ Entry @ Contrastive
type Serial = (Int, Int)
type WithSerial a = StateT Serial IO a
type SerialArrow a = IOStateArrow Serial () a
next :: Selector Serial Int -> IOStateArrow Serial a String
next :: Selector Serial Int -> SerialArrow String
next intSelector = IOSLA getUpdate
where
getUpdate state _ = pure (chgS field (+1) state, [show $ getS field state])
......@@ -37,10 +39,53 @@ pId = S {getS = fst, setS = \p (_, s) -> (p, s)}
sId :: Selector Serial Int
sId = S {getS = snd, setS = \s (p, _) -> (p, s)}
{-
to :: FilePath -> FilePath -> (Int, [Article]) -> StateT Serial IO ()
to source target (_, tome) =
mapM_ (processArticle source target) tome
-}
stateOfArrow :: IOStateArrow s () a -> StateT s IO ()
stateOfArrow a = StateT $ \s0 -> do
(s1, _) <- runIOSLA a (initialState s0) ()
pure ((), getS theUserState s1)
to :: FilePath -> FilePath -> (Book, [Article]) -> WithSerial ()
to source target (theBook, metadata) =
mapM_ runArrow . groupBy (tome.get) $ sortBy (rank.get) metadata
where
runArrow (tomeNumber, tomeArticles) = stateOfArrow $
compileTome source target (theBook, tomeNumber, tomeArticles)
compileTome ::
FilePath -> FilePath -> (Book, Int, [Article]) -> SerialArrow XmlTree
compileTome source target (theBook, tomeNumber, metadata) =
tomeXml >>> writeDocument [withIndent True] output
where
strTome = show tomeNumber
output = target </> (show theBook) <> "_T" <> strTome <.> ".fr.xml"
tomeXml = selem "/"
[ selem "corpus"
[ articleFrom source $< constL metadata ] ]
articleFrom :: FilePath -> Article -> SerialArrow XmlTree
articleFrom source article =
selem "doc"
[ selem "meta" [metaFrom article]
, selem "text" [loadConllu input] ]
--(formatParagraph <$> byParagraph dom) ]
where
input = source </> relativePath article "conllu"
loadConllu :: FilePath -> SerialArrow XmlTree
loadConllu input =
arrIO0 (parseConllu input <$> readFile input) >>> (arrIO debug ||| format)
where
debug msg = die $ "In file " <> input <> "\n" <> msg
format = formatParagraph $< arrL byParagraph
{-
processArticle :: FilePath -> FilePath -> Article -> StateT Serial IO ()
processArticle source target article =
--processArticle source target a@(Article {rank}) =
......@@ -63,10 +108,10 @@ format article outputPath dom = StateT $ \s -> do
[selem "doc"
[selem "meta" [metaFrom article]
,selem "text" (formatParagraph <$> byParagraph dom)]]]
-}
metaFrom :: ArrowXml a => Article -> a n XmlTree
metaFrom article = txt $
--metaFrom (Article {uid, tome, rank, headWord, authors, domains}) = txt $
concatMap (++ "\n")
(List.intercalate "\t" <$>
[[]
......@@ -75,13 +120,15 @@ metaFrom article = txt $
,["rank", show . rank $ get article]
,["head", Text.unpack . headWord $ get article]
,["author", Text.unpack . formatList . authors $ get article]
,["domain", Text.unpack . formatList . domains $ get article]])
,["domains", Text.unpack . formatList . domains $ get article]
,["parallel", show . ("parallel" `elem`) . subCorpus $ get article ]]
,["hydronym", show . ("hydronym" `elem`) . subCorpus $ get article ]])
formatParagraph :: Paragraph -> IOStateArrow Serial n XmlTree
formatParagraph :: Paragraph -> SerialArrow XmlTree
formatParagraph (Paragraph sents) =
mkelem "p" [attr "id" (txt . ('p':) $< next pId)] (formatSentence <$> sents)
formatSentence :: Sent -> IOStateArrow Serial n XmlTree
formatSentence :: Sent -> SerialArrow XmlTree
formatSentence s =
mkelem "s" [attr "id" (txt . ('s':) $< next sId)] [txt sentences]
where
......@@ -91,8 +138,8 @@ main :: IO ()
main = getArgs >>= cli
where
withSerial = (`evalStateT` (0, 0))
byTome = groupBy (tome.get) . sortBy uid
books = groupBy (book.get)
cli [sourceMeta, sourceRoot, targetRoot] =
readNamedTsv sourceMeta
>>= either die (withSerial . mapM_ (sourceRoot `to` targetRoot) . byTome)
try (readNamedTsv sourceMeta)
>>= (withSerial . mapM_ (sourceRoot `to` targetRoot) . books)
cli _ = syntax "METADATA_CSV_FILE SOURCE_DIR TARGET_DIR"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment