#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" {-# LANGUAGE NamedFieldPuns #-} module Main where import Data.BIO (Tree(..), bio) import Data.Metadata ( HasAuthors(..), Book(..), groupBy, readTsv, relativePath ) import Data.Metadata.Paragraph (Paragraph(..)) import Data.SpaCy (Annotation(..)) import Data.Text as Text (unpack) import System.Environment (getArgs) import System.Exit (die) import System.FilePath ((</>), (<.>)) import System.Script (syntax) import Text.TEI ( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title ) import Text.XML.HXT.Core ( (>>>), (|||), ($<), IOSArrow, XmlTree, arr, arrIO, arrIO0, attr, constL , mkelem, mkText, runX, selem, txt, withIndent, writeDocument ) to :: FilePath -> FilePath -> (Int, [Paragraph]) -> IO () to source target = fmap (\_ -> ()) . runX . compileTome source target compileTome :: FilePath -> FilePath -> (Int, [Paragraph]) -> IOSArrow b XmlTree compileTome source target (tomeNumber, metadata) = tomeXml >>> writeDocument [withIndent True] output where strTome = show tomeNumber output = target </> "T" <> strTome <.> "xml" tomeXml = selem "/" [ selem "teiCorpus" [ corpusHeader EDdA strTome , (paragraphFrom source $< constL metadata) ] ] paragraphFrom :: FilePath -> Paragraph -> IOSArrow b XmlTree paragraphFrom source paragraph@(Paragraph {headWord, paragraphId}) = selem "TEI" [ teiHeader (selem "titleStmt" [title $ headWord <> " ยง" <> show paragraphId]) (publicationStmt [ selem "p" [ txt "Annotated with SpaCy by project GEODE" ] ]) (sourceDesc [ authorArrow >>> selem "author" [ mkText ] ]) , text "paragraph" paragraph (loadTsv input) ] where input = source </> relativePath paragraph <.> "tsv" authorArrow = constL (Text.unpack <$> authors paragraph) loadTsv :: FilePath -> IOSArrow b XmlTree loadTsv input = arrIO0 (readTsv input) >>> ((arrIO die) ||| format) where format = toXml $< arr (bio namedEntity) toXml (Root nodes) = selem "body" [ toXml $< constL nodes ] toXml (Leaf (Annotation {lemma, pos, form})) = mkelem "w" [ attr "lemma" (txt lemma) , attr "pos" (txt pos) ] [ txt form ] toXml (Node namedEntity nodes) = mkelem "rs" [ attr "type" (txt namedEntity) ] [ toXml $< constL nodes ] main :: IO () main = getArgs >>= cli where cli [datasetPath, source, target] = readTsv datasetPath >>= either die (mapM_ (source `to` target) . groupBy tome) cli _ = syntax "DATASET_TSV SOURCE_DIR TARGET_DIR"