Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
{-# LANGUAGE OverloadedStrings #-}
import Control.Applicative ((<|>), many, optional)
import Data.Attoparsec.Text as Atto
( Parser, char, choice, inClass, letter, many1, parseOnly, sepBy, space, string
, takeWhile )
import Data.ByteString.Lazy as BS (getContents)
import Data.Char (isUpper, toLower, toUpper)
import Data.Csv (FromRecord(..), HasHeader(..), decode)
--import Data.Csv (HasHeader(..), decode)
import Data.Text as Text (Text, pack)
import Data.Text.IO as Text (readFile)
--import Data.Text.IO as Text (getContents, readFile)
import System.Script (syntax, warn)
import System.Environment (getArgs)
import System.Exit (die)
import System.FilePath ((</>))
import Text.Printf (printf)
parenthesis :: Parser Text
parenthesis = char '(' *> Atto.takeWhile (/= ')') <* char ')'
qualification :: Parser Text
qualification =
qualifier <* optional (char 'e') <* optional (char 's') <* many1 space
where
qualifier = choice $ string . pack <$> (vocabulary ++ (ucFirst <$> vocabulary))
vocabulary = ["petit", "grand", "vaste", "belle", "fameux", "fameuse"]
ucFirst (c:cs) = toUpper c : cs
ucFirst s = s
eddaFirstWord :: String -> Parser String
eddaFirstWord headWord =
diderotStar *> many space *> form *> meta *> optional qualification *> many1 letter
where
diderotStar = optional (char '*')
pulp = Atto.takeWhile (inClass ",. \t\n\r")
form = string (pack headWord) <|> Atto.takeWhile isUpper
meta = pulp `sepBy` (abbreviation <|> parenthesis)
abbreviation = pack <$> many1 letter <* char '.'
--gram = choice [noun, adj] <* char '.'
--noun = string "s." <* optional space <* choice [char 'm', char 'f']
--adj = string "adj"
lgeFirstWord :: String -> Parser String
lgeFirstWord headWord = form *> pulp *> meta *> optional qualification *> pulp *> many1 letter
where
form = string (pack headWord)
pulp = Atto.takeWhile (inClass ",. \t\n\r")
meta = many (parenthesis *> pulp)
type Triple = (Int, Int, String)
newtype EDdA = EDdA { eddaTriple :: Triple }
newtype LGE = LGE { lgeTriple :: Triple }
eddaLine :: (Int, Int, String, String, String, String) -> EDdA
eddaLine (tome, name, headWord, _, _, _) = EDdA (tome, name, headWord)
lgeLine :: (String, Int, Int, String, String) -> LGE
lgeLine (_,tome, name, headWord, _) = LGE (tome, name, headWord)
instance FromRecord EDdA where
parseRecord = fmap eddaLine . parseRecord
instance FromRecord LGE where
parseRecord = fmap lgeLine . parseRecord
data BookModule a =
BookModule
{ getTriple :: a -> Triple
, relativePath :: Int -> Int -> FilePath
, firstWordParser :: String -> Atto.Parser String }
edda :: BookModule EDdA
edda = BookModule
{ getTriple = eddaTriple
, relativePath = printf "T%d/article%d.txt"
, firstWordParser = eddaFirstWord }
lge :: BookModule LGE
lge = BookModule
{ getTriple = lgeTriple
, relativePath = printf "T%d/ByRank/%d.txt"
, firstWordParser = lgeFirstWord }
extractFrom :: FromRecord a => FilePath -> BookModule a -> IO ()
extractFrom rootDirectory book =
decode HasHeader <$> BS.getContents
>>= either die (mapM_ (wordFromTriple . getTriple book))
where
wordFromTriple (tome, name, headWord) =
Text.readFile (rootDirectory </> relativePath book tome name)
>>= pure . parseOnly (firstWordParser book headWord)
>>= either
(\_ -> warn $ printf "%d\t%d" tome name)
(\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
main :: IO ()
main = getArgs >>= run
where
normalize = fmap toLower
run [book, rootDirectory]
| normalize book == "edda" = extractFrom rootDirectory edda
| normalize book == "edda" = extractFrom rootDirectory edda
| normalize book == "lge" = extractFrom rootDirectory lge
run _ = syntax "BOOK[either EDdA or LGE] SOURCE_DIRECTORY path to the root directory containing the articles (which CSV metadata are read from stdin)"
{-
parseEDdAArticle :: FilePath -> (Int, Int, String, String, String, String) -> IO ()
parseEDdAArticle rootDirectory (tome, name, headWord, _, _, _) =
Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name))
>>= pure . parseOnly (eddaFirstWord headWord)
>>= either
(\_ -> warn $ printf "%d\t%d" tome name)
(\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
parseLGEArticle :: FilePath -> (String, Int, Int, String, String) -> IO ()
parseLGEArticle rootDirectory (_,tome, name, headWord, _) =
Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name))
>>= pure . parseOnly (lgeFirstWord headWord)
>>= either
(\_ -> warn $ printf "%d\t%d" tome name)
(\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
extractFirstWord :: FilePath -> (String -> Parser String)
-> (String, Int, Int, String, String) -> IO ()
-- -> (Int, Int, String, String, String, String) -> IO ()
extractFirstWord rootDirectory parser (_,tome, name, headWord, _) =
--extractFirstWord rootDirectory parser (tome, name, headWord, _, _, _) =
Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name))
--Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name))
>>= pure . parseOnly (parser headWord)
>>= either
(\_ -> warn $ printf "%d\t%d" tome name)
(\word -> putStrLn $ printf "%d\t%d\t%s" tome name word)
data Book = EDdA | LGE
data Triple a where
EDdA :: (Int, String, String, String, String, String) -> Triple (Int, String, String)
LGE :: (String, Int, String, String, String) -> Triple (Int, String, String)
class Triple a where
getTriple :: a -> (Int, String, String)
instance Triple EDdA (Int, String, String, String, String, String) where
getTriple
getTriple :: Triple a -> a
getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord)
getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord)
data BookModule =
BookModule
{ getRecord :: Triple (Int, String, String)
, relativePath :: (Int, String) -> FilePath
, firstWordParser :: Atto.Parser Text }
eddaLine :: (Int, Int, String, String, String, String) -> (Int, String, String)
eddaLine (tome, name, headWord, _, _, _) = (tome, name, headWord)
eddaPath
lgeLine :: (String, Int, String, String, String) -> (Int, String, String)
lgeLine (_,tome, name, headWord, _) = (tome, name, headWord)
data Format =
EDdA (Int, String, String, String, String, String)
| LGE (String, Int, String, String, String)
getTriple :: Format -> Triple
getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord)
getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord)
-}