Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
{-# LANGUAGE NamedFieldPuns #-}
module Main where
import Data.BIO (Tree(..), bio)
import Data.Metadata
( HasAuthors(..), Book(..), groupBy, readTsv, relativePath )
import Data.Metadata.Paragraph (Paragraph(..))
import Data.SpaCy (Annotation(..))
import Data.Text as Text (unpack)
import System.Environment (getArgs)
import System.Exit (die)
import System.FilePath ((</>), (<.>))
import System.Script (syntax)
import Text.TEI
( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title )
import Text.XML.HXT.Core
( (>>>), (|||), ($<), IOSArrow, XmlTree, arr, arrIO, arrIO0, attr, constL
, mkelem, mkText, runX, selem, txt, withIndent, writeDocument )
to :: FilePath -> FilePath -> (Int, [Paragraph]) -> IO ()
to source target = fmap (\_ -> ()) . runX . compileTome source target
compileTome :: FilePath -> FilePath -> (Int, [Paragraph]) -> IOSArrow b XmlTree
compileTome source target (tomeNumber, metadata) =
tomeXml >>> writeDocument [withIndent True] output
where
strTome = show tomeNumber
output = target </> "T" <> strTome <.> "xml"
tomeXml = selem "/"
[ selem "teiCorpus"
[ corpusHeader EDdA strTome
, (paragraphFrom source $< constL metadata) ] ]
paragraphFrom :: FilePath -> Paragraph -> IOSArrow b XmlTree
paragraphFrom source paragraph@(Paragraph {headWord, paragraphId}) =
selem "TEI"
[ teiHeader
(selem "titleStmt" [title $ headWord <> " §" <> show paragraphId])
(publicationStmt
[ selem "p" [ txt "Annotated with SpaCy by project GEODE" ] ])
(sourceDesc [ authorArrow >>> selem "author" [ mkText ] ])
, text "paragraph" paragraph (loadTsv input) ]
where
input = source </> relativePath paragraph <.> "tsv"
authorArrow = constL (Text.unpack <$> authors paragraph)
loadTsv :: FilePath -> IOSArrow b XmlTree
loadTsv input = arrIO0 (readTsv input) >>> ((arrIO die) ||| format)
where
format = toXml $< arr (bio namedEntity)
toXml (Root nodes) = selem "body" [ toXml $< constL nodes ]
toXml (Leaf (Annotation {lemma, pos, form})) =
mkelem "w" [ attr "lemma" (txt lemma)
, attr "pos" (txt pos) ]
[ txt form ]
toXml (Node namedEntity nodes) =
mkelem "rs" [ attr "type" (txt namedEntity) ] [ toXml $< constL nodes ]
main :: IO ()
main = getArgs >>= cli
where
cli [datasetPath, source, target] = readTsv datasetPath
>>= either die (mapM_ (source `to` target) . groupBy tome)
cli _ = syntax "DATASET_TSV SOURCE_DIR TARGET_DIR"