Skip to content
Snippets Groups Projects
conllu-to-txm.hs 3.77 KiB
Newer Older
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
{-# LANGUAGE ExplicitNamespaces, NamedFieldPuns, OverloadedStrings #-}

import Conllu.Parse (parseConllu)
import Conllu.Type (AW, CW(..), ID(..), Sent(..))
import Control.Applicative ((<|>), liftA2)
import GEODE.Metadata
  ( Book, Contrastive(..), Entry(..), Has(..), MultiText(..), PrimaryKey(..)
  , type(@), relativePath, groupBy, readNamedTsv, sortBy )
import Data.Text as Text (breakOn, drop, pack, unpack, splitOn)
import System.Environment (getArgs)
import System.Exit (die)
import System.FilePath ((</>), (<.>))
import System.Script (syntax, try)
import Text.TEI
  ( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title )
import Text.XML.HXT.Core
  ( (>>>), (|||), ($<), IOSLA(..), IOSArrow, XmlTree, arr, arrIO, arrIO0, arrL
  , attr, constL, mkelem, mkText, runX, selem, txt, unlistA, withIndent
  , writeDocument )

type Article = PrimaryKey @ Entry @ Contrastive

to :: FilePath -> FilePath -> (Book, [Article]) -> IO ()
to source target (theBook, metadata) =
  mapM_ runArrow . groupBy (tome.get) $ sortBy (rank.get) metadata
  where
    runArrow (tomeNumber, tomeArticles) =
      runX $ compileTome source target (theBook, tomeNumber, tomeArticles)

compileTome ::
  FilePath -> FilePath -> (Book, Int, [Article]) -> IOSArrow b XmlTree
compileTome source target (theBook, tomeNumber, metadata) =
  tomeXml >>> writeDocument [withIndent True] output
  where
    strTome = show tomeNumber
    output = target </> (show theBook) <> "_T" <> strTome <.> "xml"
    tomeXml = selem "/"
                [ selem "teiCorpus"
                  [ corpusHeader theBook strTome
                  , (articleFrom source $< constL metadata) ] ]

articleFrom :: FilePath -> Article -> IOSArrow b XmlTree
articleFrom source article =
  selem "TEI"
    [ teiHeader
        (selem "titleStmt" [title . Text.unpack . headWord.get $ article])
        (publicationStmt
          [ selem "p" [ txt "Annotated with Stanza by project GEODE" ] ])
        (sourceDesc [ authorArrow >>> selem "author" [ mkText ] ])
    , text "article" article (loadConllu input) ]
  where
    input = source </> relativePath article "conllu"
    authorArrow = constL (Text.unpack <$> (getList . authors $ get article))

loadConllu :: FilePath -> IOSArrow b XmlTree
loadConllu input =
  arrIO0 (parseConllu input <$> readFile input) >>> (arrIO debug ||| format)
    debug msg = die $ "In file " <> input <> "\n" <> msg
    format = selem "body" [ unlistA >>> formatSentence ]
    formatSentence = selem "s" [ arr _words >>> arrL render >>> formatWord ]
    formatWord = mkelem "w" [ attr "lemma" (txt $< getMaybe _lemma)
                            , attr "pos" (txt $< getMaybe pos)
                            , attr "type" (txt $< getMaybe misc) ]
                            [ txt $< getMaybe _form ]
    pos = liftA2 (<|>) spos _xpos

getMaybe :: (a -> Maybe b) -> IOSArrow a b
getMaybe projector = IOSLA $ \s a -> pure (s, maybe [] (:[]) $ projector a)

render :: [CW AW] -> [CW AW]
render [] = []
render (w0@(CW {_id = MID a0 b0}):w1@(CW {_id = SID a1}):w2@(CW {_id = SID b2}):ws)
  | a0 == a1 && b0 == b2 =
    w0 {_lemma = glue _lemma, _upos = Nothing, _xpos = glue spos}:(render ws)
  where
    glue f = (\a b -> a ++ "+" ++ b) <$> (f w1) <*> (f w2)
render (w:ws) = w:(render ws)

spos :: CW a -> Maybe String
spos = fmap show . _upos

misc :: CW a -> Maybe String
misc (CW {_misc}) = _misc >>= (fmap Text.unpack . getNer . Text.pack)
  where
    getNer = lookup "ner" . fmap (fmap (Text.drop 1) . breakOn "=") . splitOn "|"

main :: IO ()
main = getArgs >>= cli
  where
    books = groupBy (book.get)
    cli [inputMeta, sourceRoot, targetRoot] =
      try (readNamedTsv inputMeta) >>= mapM_ (sourceRoot `to` targetRoot) . books
    cli _ = syntax "METADATA_TSV_FILE SOURCE_DIR TARGET_DIR"