Skip to content
Snippets Groups Projects
Commit d23ab03b authored by Alice Brenon's avatar Alice Brenon
Browse files

Move the applicative parser to detect first words into the textometry directory

parent 612e2a31
No related branches found
No related tags found
No related merge requests found
{-# LANGUAGE OverloadedStrings #-}
import Control.Applicative ((<|>), many, optional)
import Data.Attoparsec.Text as Atto
( Parser, char, choice, inClass, letter, many1, parseOnly, sepBy, space, string
, takeWhile )
import Data.ByteString.Lazy as BS (getContents)
import Data.Char (isUpper, toLower, toUpper)
import Data.Csv (FromRecord(..), HasHeader(..), decode)
--import Data.Csv (HasHeader(..), decode)
import Data.Text as Text (Text, pack)
import Data.Text.IO as Text (readFile)
--import Data.Text.IO as Text (getContents, readFile)
import System.Script (syntax, warn)
import System.Environment (getArgs)
import System.Exit (die)
import System.FilePath ((</>))
import Text.Printf (printf)
parenthesis :: Parser Text
parenthesis = char '(' *> Atto.takeWhile (/= ')') <* char ')'
qualification :: Parser Text
qualification =
qualifier <* optional (char 'e') <* optional (char 's') <* many1 space
where
qualifier = choice $ string . pack <$> (vocabulary ++ (ucFirst <$> vocabulary))
vocabulary = ["petit", "grand", "vaste", "belle", "fameux", "fameuse"]
ucFirst (c:cs) = toUpper c : cs
ucFirst s = s
eddaFirstWord :: String -> Parser String
eddaFirstWord headWord =
diderotStar *> many space *> form *> meta *> optional qualification *> many1 letter
where
diderotStar = optional (char '*')
pulp = Atto.takeWhile (inClass ",. \t\n\r")
form = string (pack headWord) <|> Atto.takeWhile isUpper
meta = pulp `sepBy` (abbreviation <|> parenthesis)
abbreviation = pack <$> many1 letter <* char '.'
--gram = choice [noun, adj] <* char '.'
--noun = string "s." <* optional space <* choice [char 'm', char 'f']
--adj = string "adj"
lgeFirstWord :: String -> Parser String
lgeFirstWord headWord = form *> pulp *> meta *> optional qualification *> pulp *> many1 letter
where
form = string (pack headWord)
pulp = Atto.takeWhile (inClass ",. \t\n\r")
meta = many (parenthesis *> pulp)
type Triple = (Int, Int, String)
newtype EDdA = EDdA { eddaTriple :: Triple }
newtype LGE = LGE { lgeTriple :: Triple }
eddaLine :: (Int, Int, String, String, String, String) -> EDdA
eddaLine (tome, name, headWord, _, _, _) = EDdA (tome, name, headWord)
lgeLine :: (String, Int, Int, String, String) -> LGE
lgeLine (_,tome, name, headWord, _) = LGE (tome, name, headWord)
instance FromRecord EDdA where
parseRecord = fmap eddaLine . parseRecord
instance FromRecord LGE where
parseRecord = fmap lgeLine . parseRecord
data BookModule a =
BookModule
{ getTriple :: a -> Triple
, relativePath :: Int -> Int -> FilePath
, firstWordParser :: String -> Atto.Parser String }
edda :: BookModule EDdA
edda = BookModule
{ getTriple = eddaTriple
, relativePath = printf "T%d/article%d.txt"
, firstWordParser = eddaFirstWord }
lge :: BookModule LGE
lge = BookModule
{ getTriple = lgeTriple
, relativePath = printf "T%d/ByRank/%d.txt"
, firstWordParser = lgeFirstWord }
extractFrom :: FromRecord a => FilePath -> BookModule a -> IO ()
extractFrom rootDirectory book =
decode HasHeader <$> BS.getContents
>>= either die (mapM_ (wordFromTriple . getTriple book))
where
wordFromTriple (tome, name, headWord) =
Text.readFile (rootDirectory </> relativePath book tome name)
>>= pure . parseOnly (firstWordParser book headWord)
>>= either
(\_ -> warn $ printf "%d\t%d" tome name)
(\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
main :: IO ()
main = getArgs >>= run
where
normalize = fmap toLower
run [book, rootDirectory]
| normalize book == "edda" = extractFrom rootDirectory edda
| normalize book == "edda" = extractFrom rootDirectory edda
| normalize book == "lge" = extractFrom rootDirectory lge
run _ = syntax "BOOK[either EDdA or LGE] SOURCE_DIRECTORY path to the root directory containing the articles (which CSV metadata are read from stdin)"
{-
parseEDdAArticle :: FilePath -> (Int, Int, String, String, String, String) -> IO ()
parseEDdAArticle rootDirectory (tome, name, headWord, _, _, _) =
Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name))
>>= pure . parseOnly (eddaFirstWord headWord)
>>= either
(\_ -> warn $ printf "%d\t%d" tome name)
(\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
parseLGEArticle :: FilePath -> (String, Int, Int, String, String) -> IO ()
parseLGEArticle rootDirectory (_,tome, name, headWord, _) =
Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name))
>>= pure . parseOnly (lgeFirstWord headWord)
>>= either
(\_ -> warn $ printf "%d\t%d" tome name)
(\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
extractFirstWord :: FilePath -> (String -> Parser String)
-> (String, Int, Int, String, String) -> IO ()
-- -> (Int, Int, String, String, String, String) -> IO ()
extractFirstWord rootDirectory parser (_,tome, name, headWord, _) =
--extractFirstWord rootDirectory parser (tome, name, headWord, _, _, _) =
Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name))
--Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name))
>>= pure . parseOnly (parser headWord)
>>= either
(\_ -> warn $ printf "%d\t%d" tome name)
(\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
data Book = EDdA | LGE
data Triple a where
EDdA :: (Int, String, String, String, String, String) -> Triple (Int, String, String)
LGE :: (String, Int, String, String, String) -> Triple (Int, String, String)
class Triple a where
getTriple :: a -> (Int, String, String)
instance Triple EDdA (Int, String, String, String, String, String) where
getTriple
getTriple :: Triple a -> a
getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord)
getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord)
data BookModule =
BookModule
{ getRecord :: Triple (Int, String, String)
, relativePath :: (Int, String) -> FilePath
, firstWordParser :: Atto.Parser Text }
eddaLine :: (Int, Int, String, String, String, String) -> (Int, String, String)
eddaLine (tome, name, headWord, _, _, _) = (tome, name, headWord)
eddaPath
lgeLine :: (String, Int, String, String, String) -> (Int, String, String)
lgeLine (_,tome, name, headWord, _) = (tome, name, headWord)
data Format =
EDdA (Int, String, String, String, String, String)
| LGE (String, Int, String, String, String)
getTriple :: Format -> Triple
getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord)
getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord)
-}
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
{-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-}
import Control.Applicative ((<|>), many, optional)
import Data.Attoparsec.Text as Atto
( Parser, char, choice, inClass, letter, many1, parseOnly, sepBy, space, string
, takeWhile )
import Data.Char (isUpper, toUpper)
import Data.Csv (DefaultOrdered(..), ToNamedRecord(..))
import Data.Text as Text (Text, pack)
import Data.Text.IO as Text (readFile)
import GEODE.Metadata
( type(@)(..), ArticleRecord(..), Document(..), Entry(..), ReadTSV(..)
, Record(..), Work(..), WriteTSV(..), for, getHeader, glue )
import GHC.Generics (Generic)
import System.Script (syntax, warn)
import System.Environment (getArgs)
import System.FilePath ((</>))
parenthesis :: Parser Text
parenthesis = char '(' *> Atto.takeWhile (/= ')') <* char ')'
qualification :: Parser Text
qualification =
qualifier <* optional (char 'e') <* optional (char 's') <* many1 space
where
qualifier = choice $ string . pack <$> (vocabulary ++ (ucFirst <$> vocabulary))
vocabulary = ["petit", "grand", "vaste", "belle", "fameux", "fameuse"]
ucFirst (c:cs) = toUpper c : cs
ucFirst s = s
eddaFirstWord :: Text -> Parser String
eddaFirstWord headword =
diderotStar *> many space *> form *> meta *> optional qualification *> many1 letter
where
diderotStar = optional (char '*')
pulp = Atto.takeWhile (inClass ",. \t\n\r")
form = string headword <|> Atto.takeWhile isUpper
meta = pulp `sepBy` (abbreviation <|> parenthesis)
abbreviation = pack <$> many1 letter <* char '.'
lgeFirstWord :: Text -> Parser String
lgeFirstWord headword = form *> pulp *> meta *> optional qualification *> pulp *> many1 letter
where
form = string headword
pulp = Atto.takeWhile (inClass ",. \t\n\r")
meta = many (parenthesis *> pulp)
newtype FirstWord = FirstWord { firstWord :: String } deriving (Show, Generic)
instance DefaultOrdered FirstWord
instance ToNamedRecord FirstWord
type Result = ArticleRecord @ FirstWord
search :: FilePath -> (ArticleRecord @ Entry) -> IO ()
search rootDirectory (articleRecord :@: entry) =
Text.readFile (rootDirectory </> relativePath articleRecord "txt")
>>= pure . parseOnly (firstWordParser $ headword entry)
>>= either warn (writeTSV () . (:[]) . glue articleRecord . FirstWord)
where
firstWordParser | work articleRecord == EDdA = eddaFirstWord
| otherwise = lgeFirstWord
main :: IO ()
main = getArgs >>= run
where
run [inputTsv, rootDirectory] =
readTSV inputTsv >>= extractFrom rootDirectory . rows
run _ = syntax "INPUT_TSV SOURCE_DIRECTORY"
extractFrom rootDirectory rows = do
writeTSV () [getHeader (for :: Result)]
mapM_ (search rootDirectory) rows
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment