Skip to content
Snippets Groups Projects
spacy-to-txm.hs 2.56 KiB
Newer Older
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
{-# LANGUAGE NamedFieldPuns #-}
module Main where

import Data.BIO (Tree(..), bio)
import Data.Metadata
  ( HasAuthors(..), Book(..), groupBy, readTsv, relativePath )
import Data.Metadata.Paragraph (Paragraph(..))
import Data.SpaCy (Annotation(..))
import Data.Text as Text (unpack)
import System.Environment (getArgs)
import System.Exit (die)
import System.FilePath ((</>), (<.>))
import System.Script (syntax)
import Text.TEI
  ( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title )
import Text.XML.HXT.Core
  ( (>>>), (|||), ($<), IOSArrow, XmlTree, arr, arrIO, arrIO0, attr, constL
  , mkelem, mkText, runX, selem, txt, withIndent, writeDocument )

to :: FilePath -> FilePath -> (Int, [Paragraph]) -> IO ()
to source target = fmap (\_ -> ()) . runX . compileTome source target

compileTome :: FilePath -> FilePath -> (Int, [Paragraph]) -> IOSArrow b XmlTree
compileTome source target (tomeNumber, metadata) =
  tomeXml >>> writeDocument [withIndent True] output
  where
    strTome = show tomeNumber
    output = target </> "T" <> strTome <.> "xml"
    tomeXml = selem "/"
                [ selem "teiCorpus"
                  [ corpusHeader EDdA strTome
                  , (paragraphFrom source $< constL metadata) ] ]

paragraphFrom :: FilePath -> Paragraph -> IOSArrow b XmlTree
paragraphFrom source paragraph@(Paragraph {headWord, paragraphId}) =
  selem "TEI"
    [ teiHeader
        (selem "titleStmt" [title $ headWord <> " §" <> show paragraphId])
        (publicationStmt
          [ selem "p" [ txt "Annotated with SpaCy by project GEODE" ] ])
        (sourceDesc [ authorArrow >>> selem "author" [ mkText ] ])
    , text "paragraph" paragraph (loadTsv input) ]
  where
    input = source </> relativePath paragraph <.> "tsv"
    authorArrow = constL (Text.unpack <$> authors paragraph)

loadTsv :: FilePath -> IOSArrow b XmlTree
loadTsv input = arrIO0 (readTsv input) >>> ((arrIO die) ||| format)
  where
    format = toXml $< arr (bio namedEntity)
    toXml (Root nodes) = selem "body" [ toXml $< constL nodes ]
    toXml (Leaf (Annotation {lemma, pos, form})) =
      mkelem "w" [ attr "lemma" (txt lemma)
                 , attr "pos" (txt pos) ]
                 [ txt form ]
    toXml (Node namedEntity nodes) =
      mkelem "rs" [ attr "type" (txt namedEntity) ] [ toXml $< constL nodes ]

main :: IO ()
main = getArgs >>= cli
  where
    cli [datasetPath, source, target] = readTsv datasetPath
      >>= either die (mapM_ (source `to` target) . groupBy tome)
    cli _ = syntax "DATASET_TSV SOURCE_DIR TARGET_DIR"