Skip to content
Snippets Groups Projects
TEI.hs 3.51 KiB
module Text.TEI
  ( (.=)
  , corpusHeader
  , publicationStmt
  , sourceDesc
  , teiHeader
  , text
  , title ) where

import Data.Text (unpack)
import GEODE.Metadata
  (Book(..), Contrastive(..), Has(..), PrimaryKey(..), formatList, uid)
import Text.XML.HXT.Core (ArrowXml, XmlTree, aelem, attr, mkelem, selem, txt)

corpusHeader :: ArrowXml a => Book -> String -> a b XmlTree
corpusHeader EDdA = eddaHeader
corpusHeader LGE = lgeHeader
corpusHeader Wikipedia = wikiHeader

eddaHeader :: ArrowXml a => String -> a b XmlTree
eddaHeader strTome =
  teiHeader
    (titleStmt strTome ("Digitized" `by` "University of Chicago Library"
                       ++ "Published" `by` "ARTFL"))
    (corpusPublicationStmt)
    (sourceDesc
      [ title ("L'Encyclopédie T" <> strTome)
      , selem "author" [ txt "Collective" ]
      , selem "creation" [ selem "date" [ txt "1752" ] ] ])

lgeHeader :: ArrowXml a => String -> a b XmlTree
lgeHeader strTome =
  teiHeader
    (titleStmt strTome ("Digitized" `by` "Bibliothèque Nationale de France"))
    (corpusPublicationStmt)
    (sourceDesc
      [ title ("La Grande Encyclopédie T" <> strTome)
      , selem "author" [ txt "Collective" ]
      , selem "creation" [ selem "date" [ txt "1885" ] ]
      , selem "imprint" [ selem "date" [ txt "1885" ]
                        , selem "publisher" [ txt "H. Lamirault et Cie," ]
                        , selem "pubplace" [ txt "Paris" ] ]
      , mkelem "biblScope" [ attr "unit" (txt "volume") ] [ txt strTome ] ])

wikiHeader :: ArrowXml a => String -> a b XmlTree
wikiHeader _ =
  teiHeader
    (titleStmt "Wikipédia, L'encyclopédie libre" [])
    (corpusPublicationStmt)
    (sourceDesc
      [ selem "author" [ selem "orgName" [ txt "Wikimedia Foundation" ] ] ])

teiHeader :: ArrowXml a =>
  a b XmlTree -> a b XmlTree -> a b XmlTree -> a b XmlTree
teiHeader aTitle aPublication aSource =
  selem "teiHeader"
    [ selem "fileDesc" [ aTitle, aPublication, aSource ] ]

titleStmt :: ArrowXml a => String -> [a b XmlTree] -> a b XmlTree
titleStmt tome resps =
  selem "titleStmt"
    [ title tome
    , selem "respStmt" (resps ++ "Annotated and encoded" `by` "ICAR") ]

title :: ArrowXml a => String -> a b XmlTree
title titleContent = selem "title" [txt titleContent]

corpusPublicationStmt :: ArrowXml a => a b XmlTree
corpusPublicationStmt =
  (publicationStmt
    [selem "distributor"
      [selem "orgName" [txt "Project GÉODE"]
      ,selem "address"
          [selem "addrline" [txt "ICAR UMR 5191"]
          ,selem "addrline" [txt "ENS de Lyon"]]]])

publicationStmt :: ArrowXml a => [a b XmlTree] -> a b XmlTree
publicationStmt = selem "publicationStmt"

sourceDesc :: ArrowXml a => [a b XmlTree] -> a b XmlTree
sourceDesc biblContent =
  selem "sourceDesc" [selem "bibl" biblContent]

by :: ArrowXml a => String -> String -> [a b XmlTree]
by resp orgName = [ selem "resp" [ txt (resp <> " by") ]
                  , selem "orgName" [ txt orgName ] ]

type TXMText a = (Has Contrastive a, Has PrimaryKey a)

text :: (ArrowXml a, TXMText b) => String -> b -> a c XmlTree -> a c XmlTree
text unitName txmText body =
  mkelem "text" metadata [ body, aelem "milestone" [ "unit" .= unitName ] ]
  where
    metadata =
      [ "uid" .= uid txmText
      , "book" .= (show . book $ get txmText)
      , "author" .= (unpack . formatList . authors $ get txmText)
      , "domains" .= (unpack . formatList . domains $ get txmText)
      , "subCorpus" .= (unpack . formatList . subCorpus $ get txmText) ]

(.=) :: ArrowXml a => String -> String -> a b XmlTree
key .= value = attr key (txt value)