#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" {-# LANGUAGE ExplicitNamespaces, NamedFieldPuns, OverloadedStrings #-} import Conllu.Parse (parseConllu) import Conllu.Type (AW, CW(..), ID(..), Sent(..)) import Control.Applicative ((<|>), liftA2) import GEODE.Metadata ( Book, Contrastive(..), Entry(..), Has(..), MultiText(..), PrimaryKey(..) , type(@), relativePath, groupBy, readNamedTsv, sortBy ) import Data.Text as Text (breakOn, drop, pack, unpack, splitOn) import System.Environment (getArgs) import System.Exit (die) import System.FilePath ((</>), (<.>)) import System.Script (syntax, try) import Text.TEI ( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title ) import Text.XML.HXT.Core ( (>>>), (|||), ($<), IOSLA(..), IOSArrow, XmlTree, arr, arrIO, arrIO0, arrL , attr, constL, mkelem, mkText, runX, selem, txt, unlistA, withIndent , writeDocument ) type Article = PrimaryKey @ Entry @ Contrastive to :: FilePath -> FilePath -> (Book, [Article]) -> IO () to source target (theBook, metadata) = mapM_ runArrow . groupBy (tome.get) $ sortBy (rank.get) metadata where runArrow (tomeNumber, tomeArticles) = runX $ compileTome source target (theBook, tomeNumber, tomeArticles) compileTome :: FilePath -> FilePath -> (Book, Int, [Article]) -> IOSArrow b XmlTree compileTome source target (theBook, tomeNumber, metadata) = tomeXml >>> writeDocument [withIndent True] output where strTome = show tomeNumber output = target </> (show theBook) <> "_T" <> strTome <.> "xml" tomeXml = selem "/" [ selem "teiCorpus" [ corpusHeader theBook strTome , (articleFrom source $< constL metadata) ] ] articleFrom :: FilePath -> Article -> IOSArrow b XmlTree articleFrom source article = selem "TEI" [ teiHeader (selem "titleStmt" [title . Text.unpack . headWord.get $ article]) (publicationStmt [ selem "p" [ txt "Annotated with Stanza by project GEODE" ] ]) (sourceDesc [ authorArrow >>> selem "author" [ mkText ] ]) , text "article" article (loadConllu input) ] where input = source </> relativePath article "conllu" authorArrow = constL (Text.unpack <$> (getList . authors $ get article)) loadConllu :: FilePath -> IOSArrow b XmlTree loadConllu input = arrIO0 (parseConllu input <$> readFile input) >>> (arrIO debug ||| format) where debug msg = die $ "In file " <> input <> "\n" <> msg format = selem "body" [ unlistA >>> formatSentence ] formatSentence = selem "s" [ arr _words >>> arrL render >>> formatWord ] formatWord = mkelem "w" [ attr "lemma" (txt $< getMaybe _lemma) , attr "pos" (txt $< getMaybe pos) , attr "type" (txt $< getMaybe misc) ] [ txt $< getMaybe _form ] pos = liftA2 (<|>) spos _xpos getMaybe :: (a -> Maybe b) -> IOSArrow a b getMaybe projector = IOSLA $ \s a -> pure (s, maybe [] (:[]) $ projector a) render :: [CW AW] -> [CW AW] render [] = [] render (w0@(CW {_id = MID a0 b0}):w1@(CW {_id = SID a1}):w2@(CW {_id = SID b2}):ws) | a0 == a1 && b0 == b2 = w0 {_lemma = glue _lemma, _upos = Nothing, _xpos = glue spos}:(render ws) where glue f = (\a b -> a ++ "+" ++ b) <$> (f w1) <*> (f w2) render (w:ws) = w:(render ws) spos :: CW a -> Maybe String spos = fmap show . _upos misc :: CW a -> Maybe String misc (CW {_misc}) = _misc >>= (fmap Text.unpack . getNer . Text.pack) where getNer = lookup "ner" . fmap (fmap (Text.drop 1) . breakOn "=") . splitOn "|" main :: IO () main = getArgs >>= cli where books = groupBy (book.get) cli [inputMeta, sourceRoot, targetRoot] = try (readNamedTsv inputMeta) >>= mapM_ (sourceRoot `to` targetRoot) . books cli _ = syntax "METADATA_TSV_FILE SOURCE_DIR TARGET_DIR"