-
Alice Brenon authoredf8b70bc4
inject-metadata-to-lexicoscope.hs 1.56 KiB
#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
import Data.Attoparsec.Text (parseOnly)
import Data.Metadata.Article (Article(..))
--import Data.Metadata (Metadata, metadata)
import Data.Metadata (Article, articles)
import Data.Text as Text (Text, intercalate, isPrefixOf, pack, unpack)
import Data.Text.IO as Text (readFile)
import System.Environment (getArgs)
import System.FilePath ((</>), (<.>))
import System.Exit (die)
import System.Script (syntax)
import Text.Editor (editM)
to :: FilePath -> FilePath -> [Article] -> IO ()
to source target = mapM_ inject
where
inject a@(Article {uid}) =
editM (addMeta a) target (source </> Text.unpack uid <.> "xml")
addMeta :: Article -> [Text] -> IO [Text]
addMeta (Article {uid, tome, rank, headWord, domains}) (firstLine:_:others) = pure $
firstLine
: "<corpus><doc><meta>"
: ("fileName\t" <> uid <> ".txt")
: ("tome\t" <> Text.pack (show tome))
: ("rank\t" <> Text.pack (show rank))
: ("head\t" <> headWord)
: ("domain\t" <> intercalate " | " domains)
: ("class\t" <> getClass domains)
: others
metaLines _ _ = die "invalid input file"
getClass :: [Text] -> Text
getClass domainsList
| all hasGeo domainsList = "geography"
| any hasGeo domainsList = "some_geography"
| otherwise = "other"
where
hasGeo = ("Géographie" `isPrefixOf`)
main :: IO ()
main = getArgs >>= run
where
run [inputCSV, source, target] = Text.readFile inputCSV
>>= either die (source `to` target) . parseOnly articles
run _ = syntax "METADATA_CSV_FILE SOURCE TARGET"