Skip to content
Snippets Groups Projects
Commit f22f1393 authored by Alice Brenon's avatar Alice Brenon
Browse files

Add support for --strictTEI option to enforce the use of TEI's LLA attributes only

parent 92ca859d
No related branches found
No related tags found
No related merge requests found
...@@ -4,6 +4,7 @@ module CLI ( ...@@ -4,6 +4,7 @@ module CLI (
, getCommand , getCommand
) where ) where
import Data.Text.Lazy as Text (Text, split)
import Data.Version (showVersion) import Data.Version (showVersion)
import Control.Applicative ((<*>), optional) import Control.Applicative ((<*>), optional)
import Options.Applicative ( import Options.Applicative (
...@@ -21,19 +22,29 @@ data Command = Command { ...@@ -21,19 +22,29 @@ data Command = Command {
charPredicate :: ReadM (Char -> Bool) charPredicate :: ReadM (Char -> Bool)
charPredicate = flip elem <$> (str :: ReadM String) charPredicate = flip elem <$> (str :: ReadM String)
csv :: ReadM [Text]
csv = split (== ',') <$> str
configOptions :: Parser Config configOptions :: Parser Config
configOptions = Config configOptions = Config
<$> option (optional str) (short 'c' <> long "formColumn" <> value formColumn <$> option (optional str) (
short 'c' <> long "formColumn" <> value formColumn
<> metavar "COLUMN_NAME"
<> help "the column to use as the form" <> help "the column to use as the form"
) )
<*> option (optional csv) (short 'H' <> long "header" <> value headerOverride
<> metavar "COLUMN_NAMES"
<> help "comma-separated names to use for the columns (can be empty to unselect a column)"
)
<*> option charPredicate (short 'p' <> long "punctuation" <> value punctuation <*> option charPredicate (short 'p' <> long "punctuation" <> value punctuation
<> metavar "PUNCTUATION_CHARACTERS"
<> help "characters to encode as punctuation (defaults to Data.Char.isPunctuation)" <> help "characters to encode as punctuation (defaults to Data.Char.isPunctuation)"
) )
<*> switch (short 's' <> long "strictTEI" <*> switch (short 's' <> long "strictTEI"
<> help "only use TEI's att.linguistic on the elements" <> help "only use TEI's att.linguistic on the elements"
) )
where where
Config {formColumn, punctuation} = defaultConfig Config {formColumn, headerOverride, punctuation} = defaultConfig
command :: Parser Command command :: Parser Command
command = Command command = Command
......
...@@ -8,6 +8,7 @@ import Data.Text.Lazy (Text) ...@@ -8,6 +8,7 @@ import Data.Text.Lazy (Text)
data Config = Config { data Config = Config {
formColumn :: Maybe Text formColumn :: Maybe Text
, headerOverride :: Maybe [Text]
, punctuation :: Char -> Bool , punctuation :: Char -> Bool
, strictTEI :: Bool , strictTEI :: Bool
} }
...@@ -15,6 +16,7 @@ data Config = Config { ...@@ -15,6 +16,7 @@ data Config = Config {
defaultConfig :: Config defaultConfig :: Config
defaultConfig = Config { defaultConfig = Config {
formColumn = Nothing formColumn = Nothing
, headerOverride = Nothing
, punctuation = isPunctuation , punctuation = isPunctuation
, strictTEI = False , strictTEI = False
} }
...@@ -7,14 +7,18 @@ import Text.Printf (printf) ...@@ -7,14 +7,18 @@ import Text.Printf (printf)
data Error = data Error =
NoSuchColumn String NoSuchColumn String
| NoFormColumn
| EmptyHeader | EmptyHeader
| MissingColumn Line String | MissingColumn Line String
| ParsingError ParseError | ParsingError ParseError
| TermNotFound String | TermNotFound String
instance Show Error where instance Show Error where
show (NoSuchColumn s) = printf "\"%s\" isn't a valid column name in this file" s show (NoSuchColumn s) =
show EmptyHeader = printf "The CSV file header is empty" printf "\"%s\" isn't a valid column name in this file" s
show NoFormColumn =
"No \"form\" column has been found, select one with --formColumn"
show EmptyHeader = "The CSV file header is empty"
show (MissingColumn l s) = printf "Line %d is missing a value for column %s" l s show (MissingColumn l s) = printf "Line %d is missing a value for column %s" l s
show (ParsingError e) = show e show (ParsingError e) = show e
show (TermNotFound t) = printf "Annotated term \"%s\" wasn't found in the input" t show (TermNotFound t) = printf "Annotated term \"%s\" wasn't found in the input" t
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
module Text.TEIWA.Source ( module Text.TEIWA.Source (
Format Format
, Origin(..) , Origin(..)
, TEIWAParser
, Source(..) , Source(..)
, coNLLX , coNLLX
, csv , csv
...@@ -13,48 +13,62 @@ module Text.TEIWA.Source ( ...@@ -13,48 +13,62 @@ module Text.TEIWA.Source (
) where ) where
import Control.Monad.Except (MonadError(..)) import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (MonadReader(..), ReaderT(..))
import Control.Monad.IO.Class (MonadIO(..)) import Control.Monad.IO.Class (MonadIO(..))
import Data.Text.Lazy as Text (Text, unpack) import Data.Text.Lazy as Text (Text, toLower, unpack)
import Data.Text.Lazy.IO as Text (readFile) import Data.Text.Lazy.IO as Text (readFile)
import Text.Parsec (ParsecT, SourceName, runParserT) import Text.Parsec (SourceName, runParserT)
import Text.TEIWA.Annotation ( import Text.TEIWA.Annotation (
Annotation(..), SentenceAnnotation(..), TokenAnnotation(..) Annotation(..), SentenceAnnotation(..), TokenAnnotation(..)
) )
import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Config (Config(..))
import Text.TEIWA.Error (Error(..)) import Text.TEIWA.Error (Error(..))
import Text.TEIWA.Source.Common (AnnotationContext(..), Row) import Text.TEIWA.Source.Common (
import qualified Text.TEIWA.Source.ConLLX as ConLLX (getContext, sentences) AnnotationContext(..), Field, Row, TEIWAParser, annotationContext
import qualified Text.TEIWA.Source.CSV as CSV (body, getContext) )
import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (fields, sentences)
import qualified Text.TEIWA.Source.SSV as SSV (body, fields)
type Format = TEIWAParser Annotation
type TEIWAParser = ParsecT Text () (Either Error) isTEI :: Field -> Bool
type Format = Config -> TEIWAParser Annotation isTEI = (`elem` ["pos", "lemma", "msd"])
annotateToken :: MonadError Error m => filterAttributes :: MonadReader Config m => [(Field, Field)] -> m [(Field, Field)]
filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI
where
filterIf _ ("", _) = []
filterIf False (k, v) = [(k, v)]
filterIf True (k, v) =
let k' = Text.toLower k in if isTEI k' then [(k', v)] else []
annotateToken :: (MonadError Error m, MonadReader Config m) =>
AnnotationContext -> Row -> m TokenAnnotation AnnotationContext -> Row -> m TokenAnnotation
annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, record) = annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, record) =
case splitAt columnIndex record of case splitAt columnIndex record of
(_, []) -> throwError . MissingColumn atLine $ Text.unpack columnName (_, []) -> throwError . MissingColumn atLine $ Text.unpack columnName
(before, form:after) -> (before, form:after) ->
return $ TokenAnnotation {form, annotated = zip header (before ++ after)} TokenAnnotation form <$> filterAttributes (zip header $ before ++ after)
coNLLX :: Format coNLLX :: Format
coNLLX (Config {formColumn}) = do coNLLX = do
context <- ConLLX.getContext formColumn context <- annotationContext CoNLLX.fields
SentenceLevel <$> ( SentenceLevel <$> (
ConLLX.sentences >>= mapM ( CoNLLX.sentences >>= mapM (
fmap SentenceAnnotation . mapM (annotateToken context) fmap SentenceAnnotation . mapM (annotateToken context)
) )
) )
ssv :: Char -> Format
ssv separator = do
context <- annotationContext =<< SSV.fields separator
TokenLevel <$> (SSV.body separator >>= mapM (annotateToken context))
csv :: Format csv :: Format
csv (Config {formColumn}) = do csv = ssv ','
context <- CSV.getContext ',' formColumn
TokenLevel <$> (CSV.body ',' >>= mapM (annotateToken context))
tsv :: Format tsv :: Format
tsv (Config {formColumn}) = do tsv = ssv '\t'
context <- CSV.getContext '\t' formColumn
TokenLevel <$> (CSV.body '\t' >>= mapM (annotateToken context))
data Origin = File FilePath | Text Text data Origin = File FilePath | Text Text
...@@ -64,14 +78,14 @@ data Source = Source { ...@@ -64,14 +78,14 @@ data Source = Source {
} }
runTEIWAParser :: MonadError Error m => runTEIWAParser :: MonadError Error m =>
TEIWAParser a -> SourceName -> Text -> m a Config -> TEIWAParser a -> SourceName -> Text -> m a
runTEIWAParser p s = flattenErrors . runParserT p () s runTEIWAParser config p s = flattenErrors . (`runReaderT` config) . runParserT p () s
where where
flattenErrors = either throwError (either (throwError . ParsingError) pure) flattenErrors = either throwError (either (throwError . ParsingError) pure)
parse :: (MonadIO m, MonadError Error m) => parse :: (MonadIO m, MonadError Error m) =>
Config -> Source -> m Annotation Config -> Source -> m Annotation
parse config (Source {format, origin}) = parseFrom (format config) origin parse config (Source {format, origin}) = parseFrom format origin
where where
parseFrom p (File f) = liftIO (Text.readFile f) >>= runTEIWAParser p f parseFrom p (File f) = liftIO (Text.readFile f) >>= runTEIWAParser config p f
parseFrom p (Text t) = runTEIWAParser p "" t parseFrom p (Text t) = runTEIWAParser config p "" t
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Source.ConLLX ( module Text.TEIWA.Source.CoNLLX (
getContext fields
, sentences , sentences
) where ) where
import Control.Applicative (many) import Control.Applicative (many)
import Control.Monad.Except (MonadError(..)) import Data.Text.Lazy as Text (pack)
import Data.Text.Lazy as Text (Text, pack, unpack)
import Text.Parsec (ParsecT, Stream, char, many1, noneOf, sepBy1) import Text.Parsec (ParsecT, Stream, char, many1, noneOf, sepBy1)
import Text.TEIWA.Error (Error(..)) import Text.TEIWA.Source.Common (Field, Row, eol, recordLine)
import Text.TEIWA.Source.Common (
AnnotationContext(..), Field, Row, eol, recordLine
)
fields :: [Field] fields :: [Field]
fields = [ fields = [
...@@ -44,17 +39,3 @@ sentence = many comment *> many1 row ...@@ -44,17 +39,3 @@ sentence = many comment *> many1 row
sentences :: Stream s m Char => ParsecT s u m [Sentence] sentences :: Stream s m Char => ParsecT s u m [Sentence]
sentences = many (sentence <* many1 eol) sentences = many (sentence <* many1 eol)
getContext :: MonadError Error m => Maybe Text -> m AnnotationContext
getContext Nothing = return $ AnnotationContext {
columnIndex = 1
, columnName = "FORM"
, header = take 1 fields ++ drop 2 fields
}
getContext (Just columnName) =
case break (== columnName) fields of
(_, []) -> throwError . NoSuchColumn $ Text.unpack columnName
(before, _:after) ->
let columnIndex = length before in
let header = (before ++ after) in
return $ AnnotationContext {columnIndex, columnName, header}
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
module Text.TEIWA.Source.Common ( module Text.TEIWA.Source.Common (
AnnotationContext(..) AnnotationContext(..)
, Field , Field
, Row , Row
, TEIWAParser
, annotationContext
, eol , eol
, recordLine , recordLine
) where ) where
import Control.Applicative ((<|>)) import Control.Applicative ((<|>))
import Data.Text.Lazy as Text (Text) import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (MonadReader(..), ReaderT)
import Data.List (findIndex)
import Data.Text.Lazy as Text (Text, toLower, unpack)
import Text.Parsec ( import Text.Parsec (
Column, Line, ParsecT, Stream, char, endOfLine, getParserState, sourceLine, statePos Column, Line, ParsecT, Stream, char, endOfLine, getParserState, sourceLine, statePos
, try , try
) )
import Text.TEIWA.Config (Config(..))
import Text.TEIWA.Error (Error(..))
data AnnotationContext = AnnotationContext { data AnnotationContext = AnnotationContext {
columnIndex :: Column columnIndex :: Column
...@@ -24,6 +33,8 @@ type Field = Text ...@@ -24,6 +33,8 @@ type Field = Text
type Header = [Text] type Header = [Text]
type Row = (Line, [Field]) type Row = (Line, [Field])
type TEIWAParser = ParsecT Text () (ReaderT Config (Either Error))
eol :: Stream s m Char => ParsecT s u m () eol :: Stream s m Char => ParsecT s u m ()
eol = (try endOfLine <|> char '\r') *> return () eol = (try endOfLine <|> char '\r') *> return ()
...@@ -31,3 +42,25 @@ recordLine :: Monad m => ParsecT s u m a -> ParsecT s u m (Line, a) ...@@ -31,3 +42,25 @@ recordLine :: Monad m => ParsecT s u m a -> ParsecT s u m (Line, a)
recordLine p = (,) <$> currentLine <*> p recordLine p = (,) <$> currentLine <*> p
where where
currentLine = sourceLine . statePos <$> getParserState currentLine = sourceLine . statePos <$> getParserState
annotationContext :: Header -> TEIWAParser AnnotationContext
annotationContext defaultFields = do
Config {formColumn, headerOverride} <- ask
--headerFields <- maybe defaultFields pure headerOverride
let headerFields = maybe defaultFields id headerOverride
column <- catchNothing ((Right <$> formColumn) <|> findForm headerFields)
buildContext column headerFields
where
findForm = fmap Left . findIndex ((==) "form" . Text.toLower)
catchNothing = maybe (throwError NoFormColumn) return
buildContext :: MonadError Error m =>
Either Int Text -> Header -> m AnnotationContext
buildContext column = build . either splitAt (break . (==)) column
where
build (_, []) =
throwError . NoSuchColumn $ either (\_ -> "form") Text.unpack column
build (before, columnName:after) =
let columnIndex = length before in
let header = (before ++ after) in
return $ AnnotationContext {columnIndex, columnName, header}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Source.CSV ( module Text.TEIWA.Source.SSV (
body body
, getContext , fields
) where ) where
import Control.Applicative ((<|>), many) import Control.Applicative ((<|>), many)
import Control.Monad.Except (MonadError(..)) import Data.Text.Lazy as Text (pack)
import Data.Text.Lazy as Text (Text, pack, unpack)
import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try) import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try)
import Text.TEIWA.Error (Error(..)) import Text.TEIWA.Source.Common (Field, Row, eol, recordLine)
import Text.TEIWA.Source.Common (
AnnotationContext(..), Field, Row, eol, recordLine
)
field :: Stream s m Char => Char -> ParsecT s u m Field field :: Stream s m Char => Char -> ParsecT s u m Field
field separator = Text.pack <$> (regular <|> quoted) field separator = Text.pack <$> (regular <|> quoted)
...@@ -27,18 +22,3 @@ fields separator = (field separator `sepBy` char separator) <* eol ...@@ -27,18 +22,3 @@ fields separator = (field separator `sepBy` char separator) <* eol
body :: Stream s m Char => Char -> ParsecT s u m [Row] body :: Stream s m Char => Char -> ParsecT s u m [Row]
body = many . recordLine . fields body = many . recordLine . fields
getContext :: (Stream s m Char, MonadError Error m) =>
Char -> Maybe Text -> ParsecT s u m AnnotationContext
getContext separator formColumn = fields separator >>= aux formColumn
where
aux Nothing [] = throwError EmptyHeader
aux Nothing (columnName:header) =
return $ AnnotationContext {columnIndex = 0, columnName, header}
aux (Just columnName) headerRecord =
case break (== columnName) headerRecord of
(_, []) -> throwError . NoSuchColumn $ Text.unpack columnName
(before, _:after) ->
let columnIndex = length before in
let header = (before ++ after) in
return $ AnnotationContext {columnIndex, columnName, header}
...@@ -29,8 +29,8 @@ library ...@@ -29,8 +29,8 @@ library
, Text.TEIWA.Error , Text.TEIWA.Error
, Text.TEIWA.Source , Text.TEIWA.Source
, Text.TEIWA.Source.Common , Text.TEIWA.Source.Common
, Text.TEIWA.Source.ConLLX , Text.TEIWA.Source.CoNLLX
, Text.TEIWA.Source.CSV , Text.TEIWA.Source.SSV
build-depends: base >=4.12 && <4.15 build-depends: base >=4.12 && <4.15
, bytestring , bytestring
, mtl , mtl
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment