Skip to content
Snippets Groups Projects
inject-metadata-to-lexicoscope.hs 1.56 KiB
#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}

import Data.Attoparsec.Text (parseOnly)
import Data.Metadata.Article (Article(..))
--import Data.Metadata (Metadata, metadata)
import Data.Metadata (Article, articles)
import Data.Text as Text (Text, intercalate, isPrefixOf, pack, unpack)
import Data.Text.IO as Text (readFile)
import System.Environment (getArgs)
import System.FilePath ((</>), (<.>))
import System.Exit (die)
import System.Script (syntax)
import Text.Editor (editM)

to :: FilePath -> FilePath -> [Article] -> IO ()
to source target = mapM_ inject
  where
    inject a@(Article {uid}) =
      editM (addMeta a) target (source </> Text.unpack uid <.> "xml")

addMeta :: Article -> [Text] -> IO [Text]
addMeta (Article {uid, tome, rank, headWord, domains}) (firstLine:_:others) = pure $
    firstLine
  : "<corpus><doc><meta>"
  : ("fileName\t" <> uid <> ".txt")
  : ("tome\t" <> Text.pack (show tome))
  : ("rank\t" <> Text.pack (show rank))
  : ("head\t" <> headWord)
  : ("domain\t" <> intercalate " | " domains)
  : ("class\t" <> getClass domains)
  : others
metaLines _ _ = die "invalid input file"

getClass :: [Text] -> Text
getClass domainsList
  | all hasGeo domainsList = "geography"
  | any hasGeo domainsList = "some_geography"
  | otherwise = "other"
  where
    hasGeo = ("Géographie" `isPrefixOf`)

main :: IO ()
main = getArgs >>= run
  where
    run [inputCSV, source, target] = Text.readFile inputCSV
      >>= either die (source `to` target) . parseOnly articles
    run _ = syntax "METADATA_CSV_FILE SOURCE TARGET"