{-# LANGUAGE OverloadedStrings #-} import Control.Applicative ((<|>), many, optional) import Data.Attoparsec.Text as Atto ( Parser, char, choice, inClass, letter, many1, parseOnly, sepBy, space, string , takeWhile ) import Data.ByteString.Lazy as BS (getContents) import Data.Char (isUpper, toLower, toUpper) import Data.Csv (FromRecord(..), HasHeader(..), decode) --import Data.Csv (HasHeader(..), decode) import Data.Text as Text (Text, pack) import Data.Text.IO as Text (readFile) --import Data.Text.IO as Text (getContents, readFile) import System.Script (syntax, warn) import System.Environment (getArgs) import System.Exit (die) import System.FilePath ((</>)) import Text.Printf (printf) parenthesis :: Parser Text parenthesis = char '(' *> Atto.takeWhile (/= ')') <* char ')' qualification :: Parser Text qualification = qualifier <* optional (char 'e') <* optional (char 's') <* many1 space where qualifier = choice $ string . pack <$> (vocabulary ++ (ucFirst <$> vocabulary)) vocabulary = ["petit", "grand", "vaste", "belle", "fameux", "fameuse"] ucFirst (c:cs) = toUpper c : cs ucFirst s = s eddaFirstWord :: String -> Parser String eddaFirstWord headWord = diderotStar *> many space *> form *> meta *> optional qualification *> many1 letter where diderotStar = optional (char '*') pulp = Atto.takeWhile (inClass ",. \t\n\r") form = string (pack headWord) <|> Atto.takeWhile isUpper meta = pulp `sepBy` (abbreviation <|> parenthesis) abbreviation = pack <$> many1 letter <* char '.' --gram = choice [noun, adj] <* char '.' --noun = string "s." <* optional space <* choice [char 'm', char 'f'] --adj = string "adj" lgeFirstWord :: String -> Parser String lgeFirstWord headWord = form *> pulp *> meta *> optional qualification *> pulp *> many1 letter where form = string (pack headWord) pulp = Atto.takeWhile (inClass ",. \t\n\r") meta = many (parenthesis *> pulp) type Triple = (Int, Int, String) newtype EDdA = EDdA { eddaTriple :: Triple } newtype LGE = LGE { lgeTriple :: Triple } eddaLine :: (Int, Int, String, String, String, String) -> EDdA eddaLine (tome, name, headWord, _, _, _) = EDdA (tome, name, headWord) lgeLine :: (String, Int, Int, String, String) -> LGE lgeLine (_,tome, name, headWord, _) = LGE (tome, name, headWord) instance FromRecord EDdA where parseRecord = fmap eddaLine . parseRecord instance FromRecord LGE where parseRecord = fmap lgeLine . parseRecord data BookModule a = BookModule { getTriple :: a -> Triple , relativePath :: Int -> Int -> FilePath , firstWordParser :: String -> Atto.Parser String } edda :: BookModule EDdA edda = BookModule { getTriple = eddaTriple , relativePath = printf "T%d/article%d.txt" , firstWordParser = eddaFirstWord } lge :: BookModule LGE lge = BookModule { getTriple = lgeTriple , relativePath = printf "T%d/ByRank/%d.txt" , firstWordParser = lgeFirstWord } extractFrom :: FromRecord a => FilePath -> BookModule a -> IO () extractFrom rootDirectory book = decode HasHeader <$> BS.getContents >>= either die (mapM_ (wordFromTriple . getTriple book)) where wordFromTriple (tome, name, headWord) = Text.readFile (rootDirectory </> relativePath book tome name) >>= pure . parseOnly (firstWordParser book headWord) >>= either (\_ -> warn $ printf "%d\t%d" tome name) (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word) main :: IO () main = getArgs >>= run where normalize = fmap toLower run [book, rootDirectory] | normalize book == "edda" = extractFrom rootDirectory edda | normalize book == "edda" = extractFrom rootDirectory edda | normalize book == "lge" = extractFrom rootDirectory lge run _ = syntax "BOOK[either EDdA or LGE] SOURCE_DIRECTORY path to the root directory containing the articles (which CSV metadata are read from stdin)" {- parseEDdAArticle :: FilePath -> (Int, Int, String, String, String, String) -> IO () parseEDdAArticle rootDirectory (tome, name, headWord, _, _, _) = Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name)) >>= pure . parseOnly (eddaFirstWord headWord) >>= either (\_ -> warn $ printf "%d\t%d" tome name) (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word) parseLGEArticle :: FilePath -> (String, Int, Int, String, String) -> IO () parseLGEArticle rootDirectory (_,tome, name, headWord, _) = Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name)) >>= pure . parseOnly (lgeFirstWord headWord) >>= either (\_ -> warn $ printf "%d\t%d" tome name) (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word) extractFirstWord :: FilePath -> (String -> Parser String) -> (String, Int, Int, String, String) -> IO () -- -> (Int, Int, String, String, String, String) -> IO () extractFirstWord rootDirectory parser (_,tome, name, headWord, _) = --extractFirstWord rootDirectory parser (tome, name, headWord, _, _, _) = Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name)) --Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name)) >>= pure . parseOnly (parser headWord) >>= either (\_ -> warn $ printf "%d\t%d" tome name) (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word) data Book = EDdA | LGE data Triple a where EDdA :: (Int, String, String, String, String, String) -> Triple (Int, String, String) LGE :: (String, Int, String, String, String) -> Triple (Int, String, String) class Triple a where getTriple :: a -> (Int, String, String) instance Triple EDdA (Int, String, String, String, String, String) where getTriple getTriple :: Triple a -> a getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord) getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord) data BookModule = BookModule { getRecord :: Triple (Int, String, String) , relativePath :: (Int, String) -> FilePath , firstWordParser :: Atto.Parser Text } eddaLine :: (Int, Int, String, String, String, String) -> (Int, String, String) eddaLine (tome, name, headWord, _, _, _) = (tome, name, headWord) eddaPath lgeLine :: (String, Int, String, String, String) -> (Int, String, String) lgeLine (_,tome, name, headWord, _) = (tome, name, headWord) data Format = EDdA (Int, String, String, String, String, String) | LGE (String, Int, String, String, String) getTriple :: Format -> Triple getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord) getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord) -}