diff --git a/scripts/extract-first-word.hs b/scripts/extract-first-word.hs deleted file mode 100644 index d41da32f35d91693abad6353365bc96ba4bce71d..0000000000000000000000000000000000000000 --- a/scripts/extract-first-word.hs +++ /dev/null @@ -1,175 +0,0 @@ -{-# LANGUAGE OverloadedStrings #-} -import Control.Applicative ((<|>), many, optional) -import Data.Attoparsec.Text as Atto - ( Parser, char, choice, inClass, letter, many1, parseOnly, sepBy, space, string - , takeWhile ) -import Data.ByteString.Lazy as BS (getContents) -import Data.Char (isUpper, toLower, toUpper) -import Data.Csv (FromRecord(..), HasHeader(..), decode) ---import Data.Csv (HasHeader(..), decode) -import Data.Text as Text (Text, pack) -import Data.Text.IO as Text (readFile) ---import Data.Text.IO as Text (getContents, readFile) -import System.Script (syntax, warn) -import System.Environment (getArgs) -import System.Exit (die) -import System.FilePath ((</>)) -import Text.Printf (printf) - -parenthesis :: Parser Text -parenthesis = char '(' *> Atto.takeWhile (/= ')') <* char ')' - -qualification :: Parser Text -qualification = - qualifier <* optional (char 'e') <* optional (char 's') <* many1 space - where - qualifier = choice $ string . pack <$> (vocabulary ++ (ucFirst <$> vocabulary)) - vocabulary = ["petit", "grand", "vaste", "belle", "fameux", "fameuse"] - ucFirst (c:cs) = toUpper c : cs - ucFirst s = s - -eddaFirstWord :: String -> Parser String -eddaFirstWord headWord = - diderotStar *> many space *> form *> meta *> optional qualification *> many1 letter - where - diderotStar = optional (char '*') - pulp = Atto.takeWhile (inClass ",. \t\n\r") - form = string (pack headWord) <|> Atto.takeWhile isUpper - meta = pulp `sepBy` (abbreviation <|> parenthesis) - abbreviation = pack <$> many1 letter <* char '.' - --gram = choice [noun, adj] <* char '.' - --noun = string "s." <* optional space <* choice [char 'm', char 'f'] - --adj = string "adj" - -lgeFirstWord :: String -> Parser String -lgeFirstWord headWord = form *> pulp *> meta *> optional qualification *> pulp *> many1 letter - where - form = string (pack headWord) - pulp = Atto.takeWhile (inClass ",. \t\n\r") - meta = many (parenthesis *> pulp) - -type Triple = (Int, Int, String) -newtype EDdA = EDdA { eddaTriple :: Triple } -newtype LGE = LGE { lgeTriple :: Triple } - -eddaLine :: (Int, Int, String, String, String, String) -> EDdA -eddaLine (tome, name, headWord, _, _, _) = EDdA (tome, name, headWord) - -lgeLine :: (String, Int, Int, String, String) -> LGE -lgeLine (_,tome, name, headWord, _) = LGE (tome, name, headWord) - -instance FromRecord EDdA where - parseRecord = fmap eddaLine . parseRecord - -instance FromRecord LGE where - parseRecord = fmap lgeLine . parseRecord - -data BookModule a = - BookModule - { getTriple :: a -> Triple - , relativePath :: Int -> Int -> FilePath - , firstWordParser :: String -> Atto.Parser String } - -edda :: BookModule EDdA -edda = BookModule - { getTriple = eddaTriple - , relativePath = printf "T%d/article%d.txt" - , firstWordParser = eddaFirstWord } - -lge :: BookModule LGE -lge = BookModule - { getTriple = lgeTriple - , relativePath = printf "T%d/ByRank/%d.txt" - , firstWordParser = lgeFirstWord } - -extractFrom :: FromRecord a => FilePath -> BookModule a -> IO () -extractFrom rootDirectory book = - decode HasHeader <$> BS.getContents - >>= either die (mapM_ (wordFromTriple . getTriple book)) - where - wordFromTriple (tome, name, headWord) = - Text.readFile (rootDirectory </> relativePath book tome name) - >>= pure . parseOnly (firstWordParser book headWord) - >>= either - (\_ -> warn $ printf "%d\t%d" tome name) - (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word) - -main :: IO () -main = getArgs >>= run - where - normalize = fmap toLower - run [book, rootDirectory] - | normalize book == "edda" = extractFrom rootDirectory edda - | normalize book == "edda" = extractFrom rootDirectory edda - | normalize book == "lge" = extractFrom rootDirectory lge - run _ = syntax "BOOK[either EDdA or LGE] SOURCE_DIRECTORY path to the root directory containing the articles (which CSV metadata are read from stdin)" - -{- - -parseEDdAArticle :: FilePath -> (Int, Int, String, String, String, String) -> IO () -parseEDdAArticle rootDirectory (tome, name, headWord, _, _, _) = - Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name)) - >>= pure . parseOnly (eddaFirstWord headWord) - >>= either - (\_ -> warn $ printf "%d\t%d" tome name) - (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word) - -parseLGEArticle :: FilePath -> (String, Int, Int, String, String) -> IO () -parseLGEArticle rootDirectory (_,tome, name, headWord, _) = - Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name)) - >>= pure . parseOnly (lgeFirstWord headWord) - >>= either - (\_ -> warn $ printf "%d\t%d" tome name) - (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word) - -extractFirstWord :: FilePath -> (String -> Parser String) - -> (String, Int, Int, String, String) -> IO () - -- -> (Int, Int, String, String, String, String) -> IO () -extractFirstWord rootDirectory parser (_,tome, name, headWord, _) = ---extractFirstWord rootDirectory parser (tome, name, headWord, _, _, _) = - Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name)) - --Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name)) - >>= pure . parseOnly (parser headWord) - >>= either - (\_ -> warn $ printf "%d\t%d" tome name) - (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word) - -data Book = EDdA | LGE - -data Triple a where - EDdA :: (Int, String, String, String, String, String) -> Triple (Int, String, String) - LGE :: (String, Int, String, String, String) -> Triple (Int, String, String) - -class Triple a where - getTriple :: a -> (Int, String, String) - -instance Triple EDdA (Int, String, String, String, String, String) where - getTriple - -getTriple :: Triple a -> a -getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord) -getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord) - -data BookModule = - BookModule - { getRecord :: Triple (Int, String, String) - , relativePath :: (Int, String) -> FilePath - , firstWordParser :: Atto.Parser Text } - -eddaLine :: (Int, Int, String, String, String, String) -> (Int, String, String) -eddaLine (tome, name, headWord, _, _, _) = (tome, name, headWord) - -eddaPath - -lgeLine :: (String, Int, String, String, String) -> (Int, String, String) -lgeLine (_,tome, name, headWord, _) = (tome, name, headWord) - -data Format = - EDdA (Int, String, String, String, String, String) - | LGE (String, Int, String, String, String) - -getTriple :: Format -> Triple -getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord) -getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord) - --} diff --git a/scripts/textometry/extract-first-word.hs b/scripts/textometry/extract-first-word.hs new file mode 100755 index 0000000000000000000000000000000000000000..930c8ef22c4897ea34869e81c61e1d02108aaeb7 --- /dev/null +++ b/scripts/textometry/extract-first-word.hs @@ -0,0 +1,71 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" +{-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-} +import Control.Applicative ((<|>), many, optional) +import Data.Attoparsec.Text as Atto + ( Parser, char, choice, inClass, letter, many1, parseOnly, sepBy, space, string + , takeWhile ) +import Data.Char (isUpper, toUpper) +import Data.Csv (DefaultOrdered(..), ToNamedRecord(..)) +import Data.Text as Text (Text, pack) +import Data.Text.IO as Text (readFile) +import GEODE.Metadata + ( type(@)(..), ArticleRecord(..), Document(..), Entry(..), ReadTSV(..) + , Record(..), Work(..), WriteTSV(..), for, getHeader, glue ) +import GHC.Generics (Generic) +import System.Script (syntax, warn) +import System.Environment (getArgs) +import System.FilePath ((</>)) + +parenthesis :: Parser Text +parenthesis = char '(' *> Atto.takeWhile (/= ')') <* char ')' + +qualification :: Parser Text +qualification = + qualifier <* optional (char 'e') <* optional (char 's') <* many1 space + where + qualifier = choice $ string . pack <$> (vocabulary ++ (ucFirst <$> vocabulary)) + vocabulary = ["petit", "grand", "vaste", "belle", "fameux", "fameuse"] + ucFirst (c:cs) = toUpper c : cs + ucFirst s = s + +eddaFirstWord :: Text -> Parser String +eddaFirstWord headword = + diderotStar *> many space *> form *> meta *> optional qualification *> many1 letter + where + diderotStar = optional (char '*') + pulp = Atto.takeWhile (inClass ",. \t\n\r") + form = string headword <|> Atto.takeWhile isUpper + meta = pulp `sepBy` (abbreviation <|> parenthesis) + abbreviation = pack <$> many1 letter <* char '.' + +lgeFirstWord :: Text -> Parser String +lgeFirstWord headword = form *> pulp *> meta *> optional qualification *> pulp *> many1 letter + where + form = string headword + pulp = Atto.takeWhile (inClass ",. \t\n\r") + meta = many (parenthesis *> pulp) + +newtype FirstWord = FirstWord { firstWord :: String } deriving (Show, Generic) +instance DefaultOrdered FirstWord +instance ToNamedRecord FirstWord + +type Result = ArticleRecord @ FirstWord + +search :: FilePath -> (ArticleRecord @ Entry) -> IO () +search rootDirectory (articleRecord :@: entry) = + Text.readFile (rootDirectory </> relativePath articleRecord "txt") + >>= pure . parseOnly (firstWordParser $ headword entry) + >>= either warn (writeTSV () . (:[]) . glue articleRecord . FirstWord) + where + firstWordParser | work articleRecord == EDdA = eddaFirstWord + | otherwise = lgeFirstWord + +main :: IO () +main = getArgs >>= run + where + run [inputTsv, rootDirectory] = + readTSV inputTsv >>= extractFrom rootDirectory . rows + run _ = syntax "INPUT_TSV SOURCE_DIRECTORY" + extractFrom rootDirectory rows = do + writeTSV () [getHeader (for :: Result)] + mapM_ (search rootDirectory) rows