From f22f1393b60c4611d4bb658e54d8939b5a53dfc1 Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Fri, 19 Mar 2021 20:12:32 +0100 Subject: [PATCH] Add support for --strictTEI option to enforce the use of TEI's LLA attributes only --- app/CLI.hs | 15 ++++- lib/Text/TEIWA/Config.hs | 2 + lib/Text/TEIWA/Error.hs | 8 ++- lib/Text/TEIWA/Source.hs | 62 ++++++++++++------- lib/Text/TEIWA/Source/CSV.hs | 44 ------------- .../TEIWA/Source/{ConLLX.hs => CoNLLX.hs} | 27 ++------ lib/Text/TEIWA/Source/Common.hs | 35 ++++++++++- lib/Text/TEIWA/Source/SSV.hs | 24 +++++++ teiwa.cabal | 4 +- 9 files changed, 123 insertions(+), 98 deletions(-) delete mode 100644 lib/Text/TEIWA/Source/CSV.hs rename lib/Text/TEIWA/Source/{ConLLX.hs => CoNLLX.hs} (50%) create mode 100644 lib/Text/TEIWA/Source/SSV.hs diff --git a/app/CLI.hs b/app/CLI.hs index 8d35258..7117008 100644 --- a/app/CLI.hs +++ b/app/CLI.hs @@ -4,6 +4,7 @@ module CLI ( , getCommand ) where +import Data.Text.Lazy as Text (Text, split) import Data.Version (showVersion) import Control.Applicative ((<*>), optional) import Options.Applicative ( @@ -21,19 +22,29 @@ data Command = Command { charPredicate :: ReadM (Char -> Bool) charPredicate = flip elem <$> (str :: ReadM String) +csv :: ReadM [Text] +csv = split (== ',') <$> str + configOptions :: Parser Config configOptions = Config - <$> option (optional str) (short 'c' <> long "formColumn" <> value formColumn + <$> option (optional str) ( + short 'c' <> long "formColumn" <> value formColumn + <> metavar "COLUMN_NAME" <> help "the column to use as the form" ) + <*> option (optional csv) (short 'H' <> long "header" <> value headerOverride + <> metavar "COLUMN_NAMES" + <> help "comma-separated names to use for the columns (can be empty to unselect a column)" + ) <*> option charPredicate (short 'p' <> long "punctuation" <> value punctuation + <> metavar "PUNCTUATION_CHARACTERS" <> help "characters to encode as punctuation (defaults to Data.Char.isPunctuation)" ) <*> switch (short 's' <> long "strictTEI" <> help "only use TEI's att.linguistic on the elements" ) where - Config {formColumn, punctuation} = defaultConfig + Config {formColumn, headerOverride, punctuation} = defaultConfig command :: Parser Command command = Command diff --git a/lib/Text/TEIWA/Config.hs b/lib/Text/TEIWA/Config.hs index f863274..457c34c 100644 --- a/lib/Text/TEIWA/Config.hs +++ b/lib/Text/TEIWA/Config.hs @@ -8,6 +8,7 @@ import Data.Text.Lazy (Text) data Config = Config { formColumn :: Maybe Text + , headerOverride :: Maybe [Text] , punctuation :: Char -> Bool , strictTEI :: Bool } @@ -15,6 +16,7 @@ data Config = Config { defaultConfig :: Config defaultConfig = Config { formColumn = Nothing + , headerOverride = Nothing , punctuation = isPunctuation , strictTEI = False } diff --git a/lib/Text/TEIWA/Error.hs b/lib/Text/TEIWA/Error.hs index e5b4b59..2cb5940 100644 --- a/lib/Text/TEIWA/Error.hs +++ b/lib/Text/TEIWA/Error.hs @@ -7,14 +7,18 @@ import Text.Printf (printf) data Error = NoSuchColumn String + | NoFormColumn | EmptyHeader | MissingColumn Line String | ParsingError ParseError | TermNotFound String instance Show Error where - show (NoSuchColumn s) = printf "\"%s\" isn't a valid column name in this file" s - show EmptyHeader = printf "The CSV file header is empty" + show (NoSuchColumn s) = + printf "\"%s\" isn't a valid column name in this file" s + show NoFormColumn = + "No \"form\" column has been found, select one with --formColumn" + show EmptyHeader = "The CSV file header is empty" show (MissingColumn l s) = printf "Line %d is missing a value for column %s" l s show (ParsingError e) = show e show (TermNotFound t) = printf "Annotated term \"%s\" wasn't found in the input" t diff --git a/lib/Text/TEIWA/Source.hs b/lib/Text/TEIWA/Source.hs index 1a1db4d..6c906f7 100644 --- a/lib/Text/TEIWA/Source.hs +++ b/lib/Text/TEIWA/Source.hs @@ -1,9 +1,9 @@ {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} module Text.TEIWA.Source ( Format , Origin(..) - , TEIWAParser , Source(..) , coNLLX , csv @@ -13,48 +13,62 @@ module Text.TEIWA.Source ( ) where import Control.Monad.Except (MonadError(..)) +import Control.Monad.Reader (MonadReader(..), ReaderT(..)) import Control.Monad.IO.Class (MonadIO(..)) -import Data.Text.Lazy as Text (Text, unpack) +import Data.Text.Lazy as Text (Text, toLower, unpack) import Data.Text.Lazy.IO as Text (readFile) -import Text.Parsec (ParsecT, SourceName, runParserT) +import Text.Parsec (SourceName, runParserT) import Text.TEIWA.Annotation ( Annotation(..), SentenceAnnotation(..), TokenAnnotation(..) ) import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Error (Error(..)) -import Text.TEIWA.Source.Common (AnnotationContext(..), Row) -import qualified Text.TEIWA.Source.ConLLX as ConLLX (getContext, sentences) -import qualified Text.TEIWA.Source.CSV as CSV (body, getContext) +import Text.TEIWA.Source.Common ( + AnnotationContext(..), Field, Row, TEIWAParser, annotationContext + ) +import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (fields, sentences) +import qualified Text.TEIWA.Source.SSV as SSV (body, fields) + +type Format = TEIWAParser Annotation -type TEIWAParser = ParsecT Text () (Either Error) -type Format = Config -> TEIWAParser Annotation +isTEI :: Field -> Bool +isTEI = (`elem` ["pos", "lemma", "msd"]) -annotateToken :: MonadError Error m => +filterAttributes :: MonadReader Config m => [(Field, Field)] -> m [(Field, Field)] +filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI + where + filterIf _ ("", _) = [] + filterIf False (k, v) = [(k, v)] + filterIf True (k, v) = + let k' = Text.toLower k in if isTEI k' then [(k', v)] else [] + +annotateToken :: (MonadError Error m, MonadReader Config m) => AnnotationContext -> Row -> m TokenAnnotation annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, record) = case splitAt columnIndex record of (_, []) -> throwError . MissingColumn atLine $ Text.unpack columnName (before, form:after) -> - return $ TokenAnnotation {form, annotated = zip header (before ++ after)} + TokenAnnotation form <$> filterAttributes (zip header $ before ++ after) coNLLX :: Format -coNLLX (Config {formColumn}) = do - context <- ConLLX.getContext formColumn +coNLLX = do + context <- annotationContext CoNLLX.fields SentenceLevel <$> ( - ConLLX.sentences >>= mapM ( + CoNLLX.sentences >>= mapM ( fmap SentenceAnnotation . mapM (annotateToken context) ) ) +ssv :: Char -> Format +ssv separator = do + context <- annotationContext =<< SSV.fields separator + TokenLevel <$> (SSV.body separator >>= mapM (annotateToken context)) + csv :: Format -csv (Config {formColumn}) = do - context <- CSV.getContext ',' formColumn - TokenLevel <$> (CSV.body ',' >>= mapM (annotateToken context)) +csv = ssv ',' tsv :: Format -tsv (Config {formColumn}) = do - context <- CSV.getContext '\t' formColumn - TokenLevel <$> (CSV.body '\t' >>= mapM (annotateToken context)) +tsv = ssv '\t' data Origin = File FilePath | Text Text @@ -64,14 +78,14 @@ data Source = Source { } runTEIWAParser :: MonadError Error m => - TEIWAParser a -> SourceName -> Text -> m a -runTEIWAParser p s = flattenErrors . runParserT p () s + Config -> TEIWAParser a -> SourceName -> Text -> m a +runTEIWAParser config p s = flattenErrors . (`runReaderT` config) . runParserT p () s where flattenErrors = either throwError (either (throwError . ParsingError) pure) parse :: (MonadIO m, MonadError Error m) => Config -> Source -> m Annotation -parse config (Source {format, origin}) = parseFrom (format config) origin +parse config (Source {format, origin}) = parseFrom format origin where - parseFrom p (File f) = liftIO (Text.readFile f) >>= runTEIWAParser p f - parseFrom p (Text t) = runTEIWAParser p "" t + parseFrom p (File f) = liftIO (Text.readFile f) >>= runTEIWAParser config p f + parseFrom p (Text t) = runTEIWAParser config p "" t diff --git a/lib/Text/TEIWA/Source/CSV.hs b/lib/Text/TEIWA/Source/CSV.hs deleted file mode 100644 index eed165c..0000000 --- a/lib/Text/TEIWA/Source/CSV.hs +++ /dev/null @@ -1,44 +0,0 @@ -{-# LANGUAGE NamedFieldPuns #-} -{-# LANGUAGE FlexibleContexts #-} -module Text.TEIWA.Source.CSV ( - body - , getContext - ) where - -import Control.Applicative ((<|>), many) -import Control.Monad.Except (MonadError(..)) -import Data.Text.Lazy as Text (Text, pack, unpack) -import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try) -import Text.TEIWA.Error (Error(..)) -import Text.TEIWA.Source.Common ( - AnnotationContext(..), Field, Row, eol, recordLine - ) - -field :: Stream s m Char => Char -> ParsecT s u m Field -field separator = Text.pack <$> (regular <|> quoted) - where - regular = many (noneOf $ separator:"\n\r\"") - quoted = between quote quote $ - many (noneOf "\"" <|> try (string "\"\"" *> pure '"')) - quote = char '"' - -fields :: Stream s m Char => Char -> ParsecT s u m [Field] -fields separator = (field separator `sepBy` char separator) <* eol - -body :: Stream s m Char => Char -> ParsecT s u m [Row] -body = many . recordLine . fields - -getContext :: (Stream s m Char, MonadError Error m) => - Char -> Maybe Text -> ParsecT s u m AnnotationContext -getContext separator formColumn = fields separator >>= aux formColumn - where - aux Nothing [] = throwError EmptyHeader - aux Nothing (columnName:header) = - return $ AnnotationContext {columnIndex = 0, columnName, header} - aux (Just columnName) headerRecord = - case break (== columnName) headerRecord of - (_, []) -> throwError . NoSuchColumn $ Text.unpack columnName - (before, _:after) -> - let columnIndex = length before in - let header = (before ++ after) in - return $ AnnotationContext {columnIndex, columnName, header} diff --git a/lib/Text/TEIWA/Source/ConLLX.hs b/lib/Text/TEIWA/Source/CoNLLX.hs similarity index 50% rename from lib/Text/TEIWA/Source/ConLLX.hs rename to lib/Text/TEIWA/Source/CoNLLX.hs index 2a04a0e..5f2d517 100644 --- a/lib/Text/TEIWA/Source/ConLLX.hs +++ b/lib/Text/TEIWA/Source/CoNLLX.hs @@ -1,19 +1,14 @@ -{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE FlexibleContexts #-} -module Text.TEIWA.Source.ConLLX ( - getContext +module Text.TEIWA.Source.CoNLLX ( + fields , sentences ) where import Control.Applicative (many) -import Control.Monad.Except (MonadError(..)) -import Data.Text.Lazy as Text (Text, pack, unpack) +import Data.Text.Lazy as Text (pack) import Text.Parsec (ParsecT, Stream, char, many1, noneOf, sepBy1) -import Text.TEIWA.Error (Error(..)) -import Text.TEIWA.Source.Common ( - AnnotationContext(..), Field, Row, eol, recordLine - ) +import Text.TEIWA.Source.Common (Field, Row, eol, recordLine) fields :: [Field] fields = [ @@ -44,17 +39,3 @@ sentence = many comment *> many1 row sentences :: Stream s m Char => ParsecT s u m [Sentence] sentences = many (sentence <* many1 eol) - -getContext :: MonadError Error m => Maybe Text -> m AnnotationContext -getContext Nothing = return $ AnnotationContext { - columnIndex = 1 - , columnName = "FORM" - , header = take 1 fields ++ drop 2 fields - } -getContext (Just columnName) = - case break (== columnName) fields of - (_, []) -> throwError . NoSuchColumn $ Text.unpack columnName - (before, _:after) -> - let columnIndex = length before in - let header = (before ++ after) in - return $ AnnotationContext {columnIndex, columnName, header} diff --git a/lib/Text/TEIWA/Source/Common.hs b/lib/Text/TEIWA/Source/Common.hs index efb5688..83839e9 100644 --- a/lib/Text/TEIWA/Source/Common.hs +++ b/lib/Text/TEIWA/Source/Common.hs @@ -1,18 +1,27 @@ {-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE OverloadedStrings #-} module Text.TEIWA.Source.Common ( AnnotationContext(..) , Field , Row + , TEIWAParser + , annotationContext , eol , recordLine ) where import Control.Applicative ((<|>)) -import Data.Text.Lazy as Text (Text) +import Control.Monad.Except (MonadError(..)) +import Control.Monad.Reader (MonadReader(..), ReaderT) +import Data.List (findIndex) +import Data.Text.Lazy as Text (Text, toLower, unpack) import Text.Parsec ( Column, Line, ParsecT, Stream, char, endOfLine, getParserState, sourceLine, statePos , try ) +import Text.TEIWA.Config (Config(..)) +import Text.TEIWA.Error (Error(..)) data AnnotationContext = AnnotationContext { columnIndex :: Column @@ -24,6 +33,8 @@ type Field = Text type Header = [Text] type Row = (Line, [Field]) +type TEIWAParser = ParsecT Text () (ReaderT Config (Either Error)) + eol :: Stream s m Char => ParsecT s u m () eol = (try endOfLine <|> char '\r') *> return () @@ -31,3 +42,25 @@ recordLine :: Monad m => ParsecT s u m a -> ParsecT s u m (Line, a) recordLine p = (,) <$> currentLine <*> p where currentLine = sourceLine . statePos <$> getParserState + +annotationContext :: Header -> TEIWAParser AnnotationContext +annotationContext defaultFields = do + Config {formColumn, headerOverride} <- ask + --headerFields <- maybe defaultFields pure headerOverride + let headerFields = maybe defaultFields id headerOverride + column <- catchNothing ((Right <$> formColumn) <|> findForm headerFields) + buildContext column headerFields + where + findForm = fmap Left . findIndex ((==) "form" . Text.toLower) + catchNothing = maybe (throwError NoFormColumn) return + +buildContext :: MonadError Error m => + Either Int Text -> Header -> m AnnotationContext +buildContext column = build . either splitAt (break . (==)) column + where + build (_, []) = + throwError . NoSuchColumn $ either (\_ -> "form") Text.unpack column + build (before, columnName:after) = + let columnIndex = length before in + let header = (before ++ after) in + return $ AnnotationContext {columnIndex, columnName, header} diff --git a/lib/Text/TEIWA/Source/SSV.hs b/lib/Text/TEIWA/Source/SSV.hs new file mode 100644 index 0000000..43c9e3c --- /dev/null +++ b/lib/Text/TEIWA/Source/SSV.hs @@ -0,0 +1,24 @@ +{-# LANGUAGE FlexibleContexts #-} +module Text.TEIWA.Source.SSV ( + body + , fields + ) where + +import Control.Applicative ((<|>), many) +import Data.Text.Lazy as Text (pack) +import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try) +import Text.TEIWA.Source.Common (Field, Row, eol, recordLine) + +field :: Stream s m Char => Char -> ParsecT s u m Field +field separator = Text.pack <$> (regular <|> quoted) + where + regular = many (noneOf $ separator:"\n\r\"") + quoted = between quote quote $ + many (noneOf "\"" <|> try (string "\"\"" *> pure '"')) + quote = char '"' + +fields :: Stream s m Char => Char -> ParsecT s u m [Field] +fields separator = (field separator `sepBy` char separator) <* eol + +body :: Stream s m Char => Char -> ParsecT s u m [Row] +body = many . recordLine . fields diff --git a/teiwa.cabal b/teiwa.cabal index 01eb1bc..8e84544 100644 --- a/teiwa.cabal +++ b/teiwa.cabal @@ -29,8 +29,8 @@ library , Text.TEIWA.Error , Text.TEIWA.Source , Text.TEIWA.Source.Common - , Text.TEIWA.Source.ConLLX - , Text.TEIWA.Source.CSV + , Text.TEIWA.Source.CoNLLX + , Text.TEIWA.Source.SSV build-depends: base >=4.12 && <4.15 , bytestring , mtl -- GitLab