-
Alice Brenon authored1e28d593
extract-first-word.hs 6.51 KiB
{-# LANGUAGE OverloadedStrings #-}
import Control.Applicative ((<|>), many, optional)
import Data.Attoparsec.Text as Atto
( Parser, char, choice, inClass, letter, many1, parseOnly, sepBy, space, string
, takeWhile )
import Data.ByteString.Lazy as BS (getContents)
import Data.Char (isUpper, toLower, toUpper)
import Data.Csv (FromRecord(..), HasHeader(..), decode)
--import Data.Csv (HasHeader(..), decode)
import Data.Text as Text (Text, pack)
import Data.Text.IO as Text (readFile)
--import Data.Text.IO as Text (getContents, readFile)
import System.Script (syntax, warn)
import System.Environment (getArgs)
import System.Exit (die)
import System.FilePath ((</>))
import Text.Printf (printf)
parenthesis :: Parser Text
parenthesis = char '(' *> Atto.takeWhile (/= ')') <* char ')'
qualification :: Parser Text
qualification =
qualifier <* optional (char 'e') <* optional (char 's') <* many1 space
where
qualifier = choice $ string . pack <$> (vocabulary ++ (ucFirst <$> vocabulary))
vocabulary = ["petit", "grand", "vaste", "belle", "fameux", "fameuse"]
ucFirst (c:cs) = toUpper c : cs
ucFirst s = s
eddaFirstWord :: String -> Parser String
eddaFirstWord headWord =
diderotStar *> many space *> form *> meta *> optional qualification *> many1 letter
where
diderotStar = optional (char '*')
pulp = Atto.takeWhile (inClass ",. \t\n\r")
form = string (pack headWord) <|> Atto.takeWhile isUpper
meta = pulp `sepBy` (abbreviation <|> parenthesis)
abbreviation = pack <$> many1 letter <* char '.'
--gram = choice [noun, adj] <* char '.'
--noun = string "s." <* optional space <* choice [char 'm', char 'f']
--adj = string "adj"
lgeFirstWord :: String -> Parser String
lgeFirstWord headWord = form *> pulp *> meta *> optional qualification *> pulp *> many1 letter
where
form = string (pack headWord)
pulp = Atto.takeWhile (inClass ",. \t\n\r")
meta = many (parenthesis *> pulp)
type Triple = (Int, Int, String)
newtype EDdA = EDdA { eddaTriple :: Triple }
newtype LGE = LGE { lgeTriple :: Triple }
eddaLine :: (Int, Int, String, String, String, String) -> EDdA
eddaLine (tome, name, headWord, _, _, _) = EDdA (tome, name, headWord)
lgeLine :: (String, Int, Int, String, String) -> LGE
lgeLine (_,tome, name, headWord, _) = LGE (tome, name, headWord)
instance FromRecord EDdA where
parseRecord = fmap eddaLine . parseRecord
instance FromRecord LGE where
parseRecord = fmap lgeLine . parseRecord
data BookModule a =
BookModule
{ getTriple :: a -> Triple
, relativePath :: Int -> Int -> FilePath
, firstWordParser :: String -> Atto.Parser String }
edda :: BookModule EDdA
edda = BookModule
{ getTriple = eddaTriple
, relativePath = printf "T%d/article%d.txt"
, firstWordParser = eddaFirstWord }
lge :: BookModule LGE
lge = BookModule
{ getTriple = lgeTriple
, relativePath = printf "T%d/ByRank/%d.txt"
, firstWordParser = lgeFirstWord }
extractFrom :: FromRecord a => FilePath -> BookModule a -> IO ()
extractFrom rootDirectory book =
decode HasHeader <$> BS.getContents
>>= either die (mapM_ (wordFromTriple . getTriple book))
where
wordFromTriple (tome, name, headWord) =
Text.readFile (rootDirectory </> relativePath book tome name)
>>= pure . parseOnly (firstWordParser book headWord)
>>= either
(\_ -> warn $ printf "%d\t%d" tome name)
(\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
main :: IO ()
main = getArgs >>= run
where
normalize = fmap toLower
run [book, rootDirectory]
| normalize book == "edda" = extractFrom rootDirectory edda
| normalize book == "edda" = extractFrom rootDirectory edda
| normalize book == "lge" = extractFrom rootDirectory lge
run _ = syntax "BOOK[either EDdA or LGE] SOURCE_DIRECTORY path to the root directory containing the articles (which CSV metadata are read from stdin)"
{-
parseEDdAArticle :: FilePath -> (Int, Int, String, String, String, String) -> IO ()
parseEDdAArticle rootDirectory (tome, name, headWord, _, _, _) =
Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name))
>>= pure . parseOnly (eddaFirstWord headWord)
>>= either
(\_ -> warn $ printf "%d\t%d" tome name)
(\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
parseLGEArticle :: FilePath -> (String, Int, Int, String, String) -> IO ()
parseLGEArticle rootDirectory (_,tome, name, headWord, _) =
Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name))
>>= pure . parseOnly (lgeFirstWord headWord)
>>= either
(\_ -> warn $ printf "%d\t%d" tome name)
(\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
extractFirstWord :: FilePath -> (String -> Parser String)
-> (String, Int, Int, String, String) -> IO ()
-- -> (Int, Int, String, String, String, String) -> IO ()
extractFirstWord rootDirectory parser (_,tome, name, headWord, _) =
--extractFirstWord rootDirectory parser (tome, name, headWord, _, _, _) =
Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name))
--Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name))
>>= pure . parseOnly (parser headWord)
>>= either
(\_ -> warn $ printf "%d\t%d" tome name)
(\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
data Book = EDdA | LGE
data Triple a where
EDdA :: (Int, String, String, String, String, String) -> Triple (Int, String, String)
LGE :: (String, Int, String, String, String) -> Triple (Int, String, String)
class Triple a where
getTriple :: a -> (Int, String, String)
instance Triple EDdA (Int, String, String, String, String, String) where
getTriple
getTriple :: Triple a -> a
getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord)
getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord)
data BookModule =
BookModule
{ getRecord :: Triple (Int, String, String)
, relativePath :: (Int, String) -> FilePath
, firstWordParser :: Atto.Parser Text }
eddaLine :: (Int, Int, String, String, String, String) -> (Int, String, String)
eddaLine (tome, name, headWord, _, _, _) = (tome, name, headWord)
eddaPath
lgeLine :: (String, Int, String, String, String) -> (Int, String, String)
lgeLine (_,tome, name, headWord, _) = (tome, name, headWord)
data Format =
EDdA (Int, String, String, String, String, String)
| LGE (String, Int, String, String, String)
getTriple :: Format -> Triple
getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord)
getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord)
-}