Skip to content
Snippets Groups Projects
Commit 34835a05 authored by Alice Brenon's avatar Alice Brenon
Browse files

Fixing lexicoscope script quick'n'dirty to get things rolling

parent ec41aafb
No related branches found
No related tags found
No related merge requests found
......@@ -4,12 +4,13 @@
import Conllu.Parse (parseConllu)
import Conllu.Parse.Paragraph (Paragraph(..), byParagraph)
import Conllu.Print (printSent)
import Conllu.Type (Doc, Sent(..))
import Control.Monad.IO.Class (MonadIO(..))
import Conllu.Type (Sent(..))
--import Conllu.Type (Doc, Sent(..))
--import Control.Monad.IO.Class (MonadIO(..))
import Control.Monad.State (StateT(..), evalStateT)
import Data.List as List (intercalate)
--import Data.Map (toList)
import GEODE.Metadata (Book, Contrastive(..), Entry(..), Has(..), PrimaryKey(..), type(@), formatList, groupBy, readNamedTsv, relativePath, sortBy, uid)
import GEODE.Metadata (Book, Contrastive(..), Entry(..), Has(..), MultiText(..), PrimaryKey(..), type(@), formatList, groupBy, readNamedTsv, relativePath, sortBy, uid)
import Data.Text as Text (unpack)
import System.Environment (getArgs)
import System.Exit (die)
......@@ -25,9 +26,10 @@ type Article = PrimaryKey @ Entry @ Contrastive
type Serial = (Int, Int)
type WithSerial a = StateT Serial IO a
type SerialArrow a = IOStateArrow Serial () a
type SerialArrow n a = IOStateArrow Serial n a
--type SerialArrow a = IOStateArrow Serial () a
next :: Selector Serial Int -> SerialArrow String
next :: Selector Serial Int -> SerialArrow n String
next intSelector = IOSLA getUpdate
where
getUpdate state _ = pure (chgS field (+1) state, [show $ getS field state])
......@@ -36,8 +38,10 @@ next intSelector = IOSLA getUpdate
pId :: Selector Serial Int
pId = S {getS = fst, setS = \p (_, s) -> (p, s)}
{-
sId :: Selector Serial Int
sId = S {getS = snd, setS = \s (p, _) -> (p, s)}
-}
{-
to :: FilePath -> FilePath -> (Int, [Article]) -> StateT Serial IO ()
......@@ -58,17 +62,18 @@ to source target (theBook, metadata) =
compileTome source target (theBook, tomeNumber, tomeArticles)
compileTome ::
FilePath -> FilePath -> (Book, Int, [Article]) -> SerialArrow XmlTree
FilePath -> FilePath -> (Book, Int, [Article]) -> SerialArrow n XmlTree
compileTome source target (theBook, tomeNumber, metadata) =
tomeXml >>> writeDocument [withIndent True] output
where
strTome = show tomeNumber
output = target </> (show theBook) <> "_T" <> strTome <.> ".fr.xml"
tomeXml = selem "/"
[ selem "corpus"
[ selem "teiCorpus"
--[ selem "corpus"
[ articleFrom source $< constL metadata ] ]
articleFrom :: FilePath -> Article -> SerialArrow XmlTree
articleFrom :: FilePath -> Article -> SerialArrow n XmlTree
articleFrom source article =
selem "doc"
[ selem "meta" [metaFrom article]
......@@ -78,13 +83,48 @@ articleFrom source article =
where
input = source </> relativePath article "conllu"
loadConllu :: FilePath -> SerialArrow XmlTree
loadConllu :: FilePath -> SerialArrow n XmlTree
loadConllu input =
arrIO0 (parseConllu input <$> readFile input) >>> (arrIO debug ||| format)
where
debug msg = die $ "In file " <> input <> "\n" <> msg
format = formatParagraph $< arrL byParagraph
metaFrom :: ArrowXml a => Article -> a n XmlTree
metaFrom article = txt $
concatMap (++ "\n")
(List.intercalate "\t" <$>
[[]
,["fileName", uid article]
,["tome", show . tome $ get article]
,["rank", show . rank $ get article]
,["head", Text.unpack . headWord $ get article]
,["author", Text.unpack . formatList . authors $ get article]
,["domains", Text.unpack . formatList . domains $ get article]
,["parallel", show . ("parallel" `elem`) . getList . subCorpus $ get article ]
,["hydronym", show . ("hydronym" `elem`) . getList . subCorpus $ get article ]])
{-
formatSentence :: IOStateArrow Serial Sent XmlTree
formatSentence :: IOStateArrow Serial Sent XmlTree
formatSentence =
selem "s" [txt sentences]
where
sentences = dropWhile (== '\n') $ printSent (s {_meta = []})
-}
formatParagraph :: Paragraph -> SerialArrow n XmlTree
--formatParagraph :: Paragraph -> SerialArrow XmlTree
formatParagraph (Paragraph sents) =
mkelem "p" [attr "id" (txt . ('p':) $< next pId)] (formatSentence <$> sents)
formatSentence :: Sent -> SerialArrow n XmlTree
formatSentence s =
selem "s" [txt sentences]
--mkelem "s" [attr "id" (txt . ('s':) $< next sId)] [txt sentences]
where
sentences = dropWhile (== '\n') $ printSent (s {_meta = []})
{-
processArticle :: FilePath -> FilePath -> Article -> StateT Serial IO ()
processArticle source target article =
......@@ -110,30 +150,6 @@ format article outputPath dom = StateT $ \s -> do
,selem "text" (formatParagraph <$> byParagraph dom)]]]
-}
metaFrom :: ArrowXml a => Article -> a n XmlTree
metaFrom article = txt $
concatMap (++ "\n")
(List.intercalate "\t" <$>
[[]
,["fileName", uid article]
,["tome", show . tome $ get article]
,["rank", show . rank $ get article]
,["head", Text.unpack . headWord $ get article]
,["author", Text.unpack . formatList . authors $ get article]
,["domains", Text.unpack . formatList . domains $ get article]
,["parallel", show . ("parallel" `elem`) . subCorpus $ get article ]]
,["hydronym", show . ("hydronym" `elem`) . subCorpus $ get article ]])
formatParagraph :: Paragraph -> SerialArrow XmlTree
formatParagraph (Paragraph sents) =
mkelem "p" [attr "id" (txt . ('p':) $< next pId)] (formatSentence <$> sents)
formatSentence :: Sent -> SerialArrow XmlTree
formatSentence s =
mkelem "s" [attr "id" (txt . ('s':) $< next sId)] [txt sentences]
where
sentences = dropWhile (== '\n') $ printSent (s {_meta = []})
main :: IO ()
main = getArgs >>= cli
where
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment