Skip to content
Snippets Groups Projects
Commit 34835a05 authored by Alice Brenon's avatar Alice Brenon
Browse files

Fixing lexicoscope script quick'n'dirty to get things rolling

parent ec41aafb
No related branches found
No related tags found
No related merge requests found
...@@ -4,12 +4,13 @@ ...@@ -4,12 +4,13 @@
import Conllu.Parse (parseConllu) import Conllu.Parse (parseConllu)
import Conllu.Parse.Paragraph (Paragraph(..), byParagraph) import Conllu.Parse.Paragraph (Paragraph(..), byParagraph)
import Conllu.Print (printSent) import Conllu.Print (printSent)
import Conllu.Type (Doc, Sent(..)) import Conllu.Type (Sent(..))
import Control.Monad.IO.Class (MonadIO(..)) --import Conllu.Type (Doc, Sent(..))
--import Control.Monad.IO.Class (MonadIO(..))
import Control.Monad.State (StateT(..), evalStateT) import Control.Monad.State (StateT(..), evalStateT)
import Data.List as List (intercalate) import Data.List as List (intercalate)
--import Data.Map (toList) --import Data.Map (toList)
import GEODE.Metadata (Book, Contrastive(..), Entry(..), Has(..), PrimaryKey(..), type(@), formatList, groupBy, readNamedTsv, relativePath, sortBy, uid) import GEODE.Metadata (Book, Contrastive(..), Entry(..), Has(..), MultiText(..), PrimaryKey(..), type(@), formatList, groupBy, readNamedTsv, relativePath, sortBy, uid)
import Data.Text as Text (unpack) import Data.Text as Text (unpack)
import System.Environment (getArgs) import System.Environment (getArgs)
import System.Exit (die) import System.Exit (die)
...@@ -25,9 +26,10 @@ type Article = PrimaryKey @ Entry @ Contrastive ...@@ -25,9 +26,10 @@ type Article = PrimaryKey @ Entry @ Contrastive
type Serial = (Int, Int) type Serial = (Int, Int)
type WithSerial a = StateT Serial IO a type WithSerial a = StateT Serial IO a
type SerialArrow a = IOStateArrow Serial () a type SerialArrow n a = IOStateArrow Serial n a
--type SerialArrow a = IOStateArrow Serial () a
next :: Selector Serial Int -> SerialArrow String next :: Selector Serial Int -> SerialArrow n String
next intSelector = IOSLA getUpdate next intSelector = IOSLA getUpdate
where where
getUpdate state _ = pure (chgS field (+1) state, [show $ getS field state]) getUpdate state _ = pure (chgS field (+1) state, [show $ getS field state])
...@@ -36,8 +38,10 @@ next intSelector = IOSLA getUpdate ...@@ -36,8 +38,10 @@ next intSelector = IOSLA getUpdate
pId :: Selector Serial Int pId :: Selector Serial Int
pId = S {getS = fst, setS = \p (_, s) -> (p, s)} pId = S {getS = fst, setS = \p (_, s) -> (p, s)}
{-
sId :: Selector Serial Int sId :: Selector Serial Int
sId = S {getS = snd, setS = \s (p, _) -> (p, s)} sId = S {getS = snd, setS = \s (p, _) -> (p, s)}
-}
{- {-
to :: FilePath -> FilePath -> (Int, [Article]) -> StateT Serial IO () to :: FilePath -> FilePath -> (Int, [Article]) -> StateT Serial IO ()
...@@ -58,17 +62,18 @@ to source target (theBook, metadata) = ...@@ -58,17 +62,18 @@ to source target (theBook, metadata) =
compileTome source target (theBook, tomeNumber, tomeArticles) compileTome source target (theBook, tomeNumber, tomeArticles)
compileTome :: compileTome ::
FilePath -> FilePath -> (Book, Int, [Article]) -> SerialArrow XmlTree FilePath -> FilePath -> (Book, Int, [Article]) -> SerialArrow n XmlTree
compileTome source target (theBook, tomeNumber, metadata) = compileTome source target (theBook, tomeNumber, metadata) =
tomeXml >>> writeDocument [withIndent True] output tomeXml >>> writeDocument [withIndent True] output
where where
strTome = show tomeNumber strTome = show tomeNumber
output = target </> (show theBook) <> "_T" <> strTome <.> ".fr.xml" output = target </> (show theBook) <> "_T" <> strTome <.> ".fr.xml"
tomeXml = selem "/" tomeXml = selem "/"
[ selem "corpus" [ selem "teiCorpus"
--[ selem "corpus"
[ articleFrom source $< constL metadata ] ] [ articleFrom source $< constL metadata ] ]
articleFrom :: FilePath -> Article -> SerialArrow XmlTree articleFrom :: FilePath -> Article -> SerialArrow n XmlTree
articleFrom source article = articleFrom source article =
selem "doc" selem "doc"
[ selem "meta" [metaFrom article] [ selem "meta" [metaFrom article]
...@@ -78,13 +83,48 @@ articleFrom source article = ...@@ -78,13 +83,48 @@ articleFrom source article =
where where
input = source </> relativePath article "conllu" input = source </> relativePath article "conllu"
loadConllu :: FilePath -> SerialArrow XmlTree loadConllu :: FilePath -> SerialArrow n XmlTree
loadConllu input = loadConllu input =
arrIO0 (parseConllu input <$> readFile input) >>> (arrIO debug ||| format) arrIO0 (parseConllu input <$> readFile input) >>> (arrIO debug ||| format)
where where
debug msg = die $ "In file " <> input <> "\n" <> msg debug msg = die $ "In file " <> input <> "\n" <> msg
format = formatParagraph $< arrL byParagraph format = formatParagraph $< arrL byParagraph
metaFrom :: ArrowXml a => Article -> a n XmlTree
metaFrom article = txt $
concatMap (++ "\n")
(List.intercalate "\t" <$>
[[]
,["fileName", uid article]
,["tome", show . tome $ get article]
,["rank", show . rank $ get article]
,["head", Text.unpack . headWord $ get article]
,["author", Text.unpack . formatList . authors $ get article]
,["domains", Text.unpack . formatList . domains $ get article]
,["parallel", show . ("parallel" `elem`) . getList . subCorpus $ get article ]
,["hydronym", show . ("hydronym" `elem`) . getList . subCorpus $ get article ]])
{-
formatSentence :: IOStateArrow Serial Sent XmlTree
formatSentence :: IOStateArrow Serial Sent XmlTree
formatSentence =
selem "s" [txt sentences]
where
sentences = dropWhile (== '\n') $ printSent (s {_meta = []})
-}
formatParagraph :: Paragraph -> SerialArrow n XmlTree
--formatParagraph :: Paragraph -> SerialArrow XmlTree
formatParagraph (Paragraph sents) =
mkelem "p" [attr "id" (txt . ('p':) $< next pId)] (formatSentence <$> sents)
formatSentence :: Sent -> SerialArrow n XmlTree
formatSentence s =
selem "s" [txt sentences]
--mkelem "s" [attr "id" (txt . ('s':) $< next sId)] [txt sentences]
where
sentences = dropWhile (== '\n') $ printSent (s {_meta = []})
{- {-
processArticle :: FilePath -> FilePath -> Article -> StateT Serial IO () processArticle :: FilePath -> FilePath -> Article -> StateT Serial IO ()
processArticle source target article = processArticle source target article =
...@@ -110,30 +150,6 @@ format article outputPath dom = StateT $ \s -> do ...@@ -110,30 +150,6 @@ format article outputPath dom = StateT $ \s -> do
,selem "text" (formatParagraph <$> byParagraph dom)]]] ,selem "text" (formatParagraph <$> byParagraph dom)]]]
-} -}
metaFrom :: ArrowXml a => Article -> a n XmlTree
metaFrom article = txt $
concatMap (++ "\n")
(List.intercalate "\t" <$>
[[]
,["fileName", uid article]
,["tome", show . tome $ get article]
,["rank", show . rank $ get article]
,["head", Text.unpack . headWord $ get article]
,["author", Text.unpack . formatList . authors $ get article]
,["domains", Text.unpack . formatList . domains $ get article]
,["parallel", show . ("parallel" `elem`) . subCorpus $ get article ]]
,["hydronym", show . ("hydronym" `elem`) . subCorpus $ get article ]])
formatParagraph :: Paragraph -> SerialArrow XmlTree
formatParagraph (Paragraph sents) =
mkelem "p" [attr "id" (txt . ('p':) $< next pId)] (formatSentence <$> sents)
formatSentence :: Sent -> SerialArrow XmlTree
formatSentence s =
mkelem "s" [attr "id" (txt . ('s':) $< next sId)] [txt sentences]
where
sentences = dropWhile (== '\n') $ printSent (s {_meta = []})
main :: IO () main :: IO ()
main = getArgs >>= cli main = getArgs >>= cli
where where
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment