Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
import Conllu.Parse (parseConllu)
import Conllu.Type (AW, CW(..), ID(..), Sent(..))
import Control.Applicative ((<|>), liftA2)
import Data.Metadata
( HasAuthors(..), Book, FromBook(..), relativePath, uid, groupBy, readTsv
, sortBy )
import Data.Metadata.Article (Article(..))
import Data.Text as Text (unpack)
import System.Environment (getArgs)
import System.Exit (die)
import System.FilePath ((</>), (<.>))
import System.Script (syntax)
import Text.TEI
( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title )
import Text.XML.HXT.Core
( (>>>), (|||), ($<), IOSLA(..), IOSArrow, XmlTree, arr, arrIO, arrIO0, arrL
, attr, constL, mkelem, mkText, runX, selem, txt, unlistA, withIndent
, writeDocument )
to :: FilePath -> FilePath -> (Book, [Article]) -> IO ()
to source target (theBook, metadata) =
mapM_ runArrow . groupBy tome $ sortBy rank metadata
where
runArrow (tomeNumber, tomeArticles) =
runX $ compileTome source target (theBook, tomeNumber, tomeArticles)
compileTome ::
FilePath -> FilePath -> (Book, Int, [Article]) -> IOSArrow b XmlTree
compileTome source target (theBook, tomeNumber, metadata) =
tomeXml >>> writeDocument [withIndent True] output
where
strTome = show tomeNumber
output = target </> (show theBook) </> "T" <> strTome <.> "xml"
tomeXml = selem "/"
[ selem "teiCorpus"
[ corpusHeader theBook strTome
, (articleFrom source $< constL metadata) ] ]
articleFrom :: FilePath -> Article -> IOSArrow b XmlTree
articleFrom source article@(Article {headWord}) =
selem "TEI"
[ teiHeader
(selem "titleStmt" [title $ Text.unpack headWord])
(publicationStmt
[ selem "p" [ txt "Annotated with Stanza by project GEODE" ] ])
(sourceDesc [ authorArrow >>> selem "author" [ mkText ] ])
, text "article" article (loadConllu input) ]
where
input = source </> relativePath article <.> "conllu"
authorArrow = constL (Text.unpack <$> authors article)
{-
metadataFrom :: Article -> [ IOSArrow b XmlTree ]
metadataFrom article@(Article {book, authors, domains}) =
[ "id" .= uid article
, "book" .= show book
, "author" .= list (getAuthors authors)
, "domains" .= list (getDomains domains) ]
-}
loadConllu :: FilePath -> IOSArrow b XmlTree
loadConllu input = arrIO0 (parseConllu input <$> readFile input)
>>> ((arrIO die) ||| format)
where
format = selem "body" [ unlistA >>> formatSentence ]
formatSentence = selem "s" [ arr _words >>> arrL render >>> formatWord ]
formatWord = mkelem "w" [ attr "lemma" (txt $< getMaybe _lemma)
, attr "pos" (txt $< getMaybe pos) ]
[ txt $< getMaybe _form ]
pos = liftA2 (<|>) spos _xpos
getMaybe :: (a -> Maybe b) -> IOSArrow a b
getMaybe projector = IOSLA $ \s a -> pure (s, maybe [] (:[]) $ projector a)
render :: [CW AW] -> [CW AW]
render [] = []
render (w0@(CW {_id = MID a0 b0}):w1@(CW {_id = SID a1}):w2@(CW {_id = SID b2}):ws)
| a0 == a1 && b0 == b2 =
w0 {_lemma = glue _lemma, _upos = Nothing, _xpos = glue spos}:(render ws)
where
glue f = (\a b -> a ++ "+" ++ b) <$> (f w1) <*> (f w2)
render (w:ws) = w:(render ws)
spos :: CW a -> Maybe String
spos = fmap show . _upos
main :: IO ()
main = getArgs >>= cli
where
books = groupBy book . sortBy uid
cli [inputMeta, sourceRoot, targetRoot] =
readTsv inputMeta
>>= either die (mapM_ (sourceRoot `to` targetRoot) . books)
cli _ = syntax "METADATA_TSV_FILE SOURCE_DIR TARGET_DIR"