Skip to content
Snippets Groups Projects
Commit a462aaed authored by Alice Brenon's avatar Alice Brenon
Browse files

First draft, a good start but needs improvements and, before that, tests

parents
No related branches found
No related tags found
No related merge requests found
# ---> Haskell
dist
dist-*
cabal-dev
*.o
*.hi
*.chi
*.chs.h
*.dyn_o
*.dyn_hi
.hpc
.hsenv
.cabal-sandbox/
cabal.sandbox.config
*.prof
*.aux
*.hp
*.eventlog
.stack-work/
cabal.project.local
cabal.project.local~
.HTF/
.ghc.environment.*
# Revision history for parseTrevoux
## 0.1.0.0 -- 2021-07-16
* First draft, catching roughly half the articles (37891 / 77064)
LICENSE 0 → 100644
Copyright (c) 2021, Alice BRENON
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Alice BRENON nor the names of other
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import Distribution.Simple
main = defaultMain
{-# LANGUAGE NamedFieldPuns #-}
module Main where
import Data.Attoparsec.Text (parseOnly)
import Data.List (intercalate)
import Data.Text as Text (append, pack)
import Data.Text.IO as Text (readFile, writeFile)
import System.Directory (createDirectoryIfMissing)
import System.Environment (getArgs, getProgName)
import System.Exit (die)
import System.FilePath ((</>), dropExtension, takeDirectory, takeFileName)
import System.IO (Handle, IOMode(..), hClose, hPutStrLn, openFile, stderr)
import Text.Encyclopedia.Article (Article(..), InflectedPOS(..), article)
import Text.Printf (printf)
data OutputContext = OutputContext {
files :: Handle
, meta :: Handle
, targetRoot :: FilePath
}
source :: String
source = "ARTFL"
process :: OutputContext -> FilePath -> IO ()
process context filepath =
logOrOutput . parseOnly article =<< Text.readFile filepath
where
logOrOutput (Left why) = hPutStrLn stderr $ printf "%s: %s" filepath why
logOrOutput (Right parsed) = outputData context filepath parsed
outputData :: OutputContext -> FilePath -> Article -> IO ()
outputData (OutputContext {files, meta, targetRoot})
filepath
(Article {headwords, inflectedPOS, body}) = do
hPutStrLn files $ csv [textId, fileDir, fileName]
hPutStrLn meta $ csv [textId, source, renderHW, renderPOS, renderFeature]
createDirectoryIfMissing True $ destPath
Text.writeFile (destPath </> fileName) body
where
fileDir = takeFileName $ takeDirectory filepath
fileName = takeFileName filepath
destPath = targetRoot </> fileDir
textId = fileDir ++ dropExtension fileName
csv = intercalate ","
renderHW = intercalate ":" ("":headwords ++ [""])
(renderPOS, renderFeature) = maybe ("", "") extractPOS inflectedPOS
extractPOS (InflectedPOS {pos, inflection}) =
(show pos, maybe "" show inflection)
populate :: FilePath -> IO ()
populate targetRoot = do
createDirectoryIfMissing True $ targetRoot
files <- openFile (targetRoot </> "files.csv") WriteMode
meta <- openFile (targetRoot </> "meta.csv") WriteMode
hPutStrLn files "id,path,file"
hPutStrLn meta "id,source,headword,POS,features"
let context = OutputContext {files, meta, targetRoot}
getContents >>= mapM_ (process context) . lines
main :: IO ()
main = getArgs >>= handle
where
handle [targetDirectory] = populate targetDirectory
handle _ = getProgName >>= die . syntax
syntax =
printf "Syntax: %s TARGET_DIRECTORY (input files get read from stdin"
{-# LANGUAGE OverloadedStrings #-}
module Text.Encyclopedia.Article (
Article(..)
, InflectedPOS(..)
, article
, headwords_
, inflectedPOS_
, body_
) where
import Control.Applicative ((<|>), liftA2, empty, optional)
import Data.Attoparsec.Text as Atto (
(<?>), Parser, manyTill, peekChar, satisfy, sepBy1, string, takeText
, takeWhile, takeWhile1
)
import Data.Attoparsec.Combinator (lookAhead)
import Data.Char (isLower, isPunctuation, isSpace, isUpper, toLower)
import Data.Text as Text (Text, append, isPrefixOf, length, map, pack, splitAt, unpack)
data POS = Adj | Substantive | Verb deriving (Bounded, Enum, Show)
data Inflection = Masculine | Feminine deriving (Bounded, Enum, Show)
data InflectedPOS = InflectedPOS {
pos :: POS
, inflection :: Maybe Inflection
} deriving Show
data Article = Article {
headwords :: [String]
, inflectedPOS :: Maybe InflectedPOS
, body :: Text
} deriving Show
article :: Parser Article
article = Article <$> headwords_ <*> optional inflectedPOS_ <*> body_
blank :: Parser Text
blank = Atto.takeWhile isSpace
punctOrSpace :: Parser Text
punctOrSpace = takeWhile1 (liftA2 (||) isPunctuation isSpace)
abbreviation :: (Bounded a, Enum a, Show a) => Parser a
abbreviation =
recognize =<< takeWhile1 (not . isPunctuation) <* satisfy isPunctuation
where
candidates = [minBound .. maxBound]
recognize abbr = foldr (tryOne $ Text.map toLower abbr) empty candidates
tryOne read candidate next
| read `isPrefixOf` (Text.map toLower . Text.pack $ show candidate) =
pure candidate
| otherwise = next
many1Till :: Parser a -> Parser b -> Parser [a]
many1Till p after = (:) <$> p <*> manyTill p after
headwords_ :: Parser [String]
headwords_ = headword `sepBy1` headwordSeparator <* blank
where
headword =
satisfy isUpper `many1Till` lookAhead (sentenceBegining <|> punctOrSpace)
headwordSeparator =
satisfy isPunctuation <* blank <* optional (string "ou") <* blank
sentenceBegining :: Parser Text
sentenceBegining = do
upper <- satisfy isUpper
lower <- satisfy isLower
pure $ Text.pack [upper, lower]
inflectedPOS_ :: Parser InflectedPOS
inflectedPOS_ = InflectedPOS
<$> (punctOrSpace *> abbreviation <* blank)
<*> (optional abbreviation <* blank)
body_ :: Parser Text
body_ = Text.append <$> sentenceBegining <*> takeText <|> debug
where
debug = takeText >>= (empty <?>) . ("Failed reading body: " ++) . take 20 . Text.unpack
-- Initial trevouxGrammar.cabal generated by cabal init. For further
-- documentation, see http://haskell.org/cabal/users-guide/
name: trevouxGrammar
version: 0.1.0.0
synopsis: A grammar for dictionary articles
description:
This package contains a library attempting to capture a grammar of the
articles in the Dictionnaire de Trevoux (that will hopefully behave well when
applied to other dictionaries) as well as an executable building upon it to
cut headword and linguistic features and store them as metadata to ease the
work of the analyzers downstream and allow to create POS-based subcorpora.
license: BSD3
license-file: LICENSE
author: Alice BRENON
maintainer: alice.brenon@ens-lyon.fr
homepage: https://gitlab.liris.cnrs.fr/abrenon/trevouxgrammar
-- copyright:
category: Language
build-type: Simple
extra-source-files: CHANGELOG.md
cabal-version: >=1.10
library
exposed-modules: Text.Encyclopedia.Article
-- other-modules:
-- other-extensions:
build-depends: base >=4.12 && <4.13
, attoparsec
, text
hs-source-dirs: lib
default-language: Haskell2010
executable parseTrevoux
main-is: Main.hs
-- other-modules:
-- other-extensions:
build-depends: base >=4.12 && <4.13
, attoparsec
, directory
, filepath
, trevouxGrammar
, text
hs-source-dirs: app
default-language: Haskell2010
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment