Skip to content
Snippets Groups Projects
extract-first-word.hs 6.51 KiB
Newer Older
{-# LANGUAGE OverloadedStrings #-}
import Control.Applicative ((<|>), many, optional)
import Data.Attoparsec.Text as Atto
  ( Parser, char, choice, inClass, letter, many1, parseOnly, sepBy, space, string
  , takeWhile )
import Data.ByteString.Lazy as BS (getContents)
import Data.Char (isUpper, toLower, toUpper)
import Data.Csv (FromRecord(..), HasHeader(..), decode)
--import Data.Csv (HasHeader(..), decode)
import Data.Text as Text (Text, pack)
import Data.Text.IO as Text (readFile)
--import Data.Text.IO as Text (getContents, readFile)
import System.Script (syntax, warn)
import System.Environment (getArgs)
import System.Exit (die)
import System.FilePath ((</>))
import Text.Printf (printf)

parenthesis :: Parser Text
parenthesis = char '(' *> Atto.takeWhile (/= ')') <* char ')'

qualification :: Parser Text
qualification =
  qualifier <* optional (char 'e') <* optional (char 's') <* many1 space
  where
    qualifier = choice $ string . pack <$> (vocabulary ++ (ucFirst <$> vocabulary))
    vocabulary = ["petit", "grand", "vaste", "belle", "fameux", "fameuse"]
    ucFirst (c:cs) = toUpper c : cs
    ucFirst s = s

eddaFirstWord :: String -> Parser String
eddaFirstWord headWord =
  diderotStar *> many space *> form *> meta *> optional qualification *> many1 letter
  where
    diderotStar = optional (char '*')
    pulp = Atto.takeWhile (inClass ",. \t\n\r")
    form = string (pack headWord) <|> Atto.takeWhile isUpper
    meta = pulp `sepBy` (abbreviation <|> parenthesis)
    abbreviation = pack <$> many1 letter <* char '.'
    --gram = choice [noun, adj] <* char '.'
    --noun = string "s." <* optional space <* choice [char 'm', char 'f']
    --adj = string "adj"

lgeFirstWord :: String -> Parser String
lgeFirstWord headWord = form *> pulp *> meta *> optional qualification *> pulp *> many1 letter
  where
    form = string (pack headWord)
    pulp = Atto.takeWhile (inClass ",. \t\n\r")
    meta = many (parenthesis *> pulp)

type Triple = (Int, Int, String)
newtype EDdA = EDdA { eddaTriple :: Triple }
newtype LGE = LGE { lgeTriple :: Triple }

eddaLine :: (Int, Int, String, String, String, String) -> EDdA
eddaLine (tome, name, headWord, _, _, _) = EDdA (tome, name, headWord)

lgeLine :: (String, Int, Int, String, String) -> LGE
lgeLine (_,tome, name, headWord, _) = LGE (tome, name, headWord)

instance FromRecord EDdA where
  parseRecord = fmap eddaLine . parseRecord

instance FromRecord LGE where
  parseRecord = fmap lgeLine . parseRecord

data BookModule a =
  BookModule
    { getTriple :: a -> Triple
    , relativePath :: Int -> Int -> FilePath
    , firstWordParser :: String -> Atto.Parser String }

edda :: BookModule EDdA
edda = BookModule
         { getTriple = eddaTriple
         , relativePath = printf "T%d/article%d.txt"
         , firstWordParser = eddaFirstWord }

lge :: BookModule LGE
lge = BookModule
         { getTriple = lgeTriple
         , relativePath = printf "T%d/ByRank/%d.txt"
         , firstWordParser = lgeFirstWord }

extractFrom :: FromRecord a => FilePath -> BookModule a -> IO ()
extractFrom rootDirectory book =
  decode HasHeader <$> BS.getContents
  >>= either die (mapM_ (wordFromTriple . getTriple book))
  where
    wordFromTriple (tome, name, headWord) =
      Text.readFile (rootDirectory </> relativePath book tome name)
      >>= pure . parseOnly (firstWordParser book headWord)
      >>= either
            (\_ -> warn $ printf "%d\t%d" tome name)
            (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)

main :: IO ()
main = getArgs >>= run
  where
    normalize = fmap toLower
    run [book, rootDirectory]
      | normalize book == "edda" = extractFrom rootDirectory edda
      | normalize book == "edda" = extractFrom rootDirectory edda
      | normalize book == "lge" = extractFrom rootDirectory lge
    run _ = syntax "BOOK[either EDdA or LGE] SOURCE_DIRECTORY path to the root directory containing the articles (which CSV metadata are read from stdin)"

{-

parseEDdAArticle :: FilePath -> (Int, Int, String, String, String, String) -> IO ()
parseEDdAArticle rootDirectory (tome, name, headWord, _, _, _) =
  Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name))
  >>= pure . parseOnly (eddaFirstWord headWord)
  >>= either
        (\_ -> warn $ printf "%d\t%d" tome name)
        (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)

parseLGEArticle :: FilePath -> (String, Int, Int, String, String) -> IO ()
parseLGEArticle rootDirectory (_,tome, name, headWord, _) =
  Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name))
  >>= pure . parseOnly (lgeFirstWord headWord)
  >>= either
        (\_ -> warn $ printf "%d\t%d" tome name)
        (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)

extractFirstWord :: FilePath -> (String -> Parser String)
  -> (String, Int, Int, String, String) -> IO ()
  -- -> (Int, Int, String, String, String, String) -> IO ()
extractFirstWord rootDirectory parser (_,tome, name, headWord, _) =
--extractFirstWord rootDirectory parser (tome, name, headWord, _, _, _) =
  Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name))
  --Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name))
  >>= pure . parseOnly (parser headWord)
  >>= either
        (\_ -> warn $ printf "%d\t%d" tome name)
        (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)

data Book = EDdA | LGE

data Triple a where
  EDdA :: (Int, String, String, String, String, String) -> Triple (Int, String, String)
  LGE :: (String, Int, String, String, String) -> Triple (Int, String, String)

class Triple a where
  getTriple :: a -> (Int, String, String)

instance Triple EDdA (Int, String, String, String, String, String) where
  getTriple 

getTriple :: Triple a -> a
getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord)
getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord)

data BookModule =
  BookModule
    { getRecord :: Triple (Int, String, String)
    , relativePath :: (Int, String) -> FilePath
    , firstWordParser :: Atto.Parser Text }

eddaLine :: (Int, Int, String, String, String, String) -> (Int, String, String)
eddaLine (tome, name, headWord, _, _, _) = (tome, name, headWord)

eddaPath

lgeLine :: (String, Int, String, String, String) -> (Int, String, String)
lgeLine (_,tome, name, headWord, _) = (tome, name, headWord)

data Format =
  EDdA (Int, String, String, String, String, String) 
  | LGE (String, Int, String, String, String)

getTriple :: Format -> Triple
getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord)
getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord)

-}