diff --git a/app/CLI.hs b/app/CLI.hs index 8d3525861e024f6bc1b4267a2c892756ab0a4c41..7117008ce7e5ee7ee4211602c1f95fbaba40b6b8 100644 --- a/app/CLI.hs +++ b/app/CLI.hs @@ -4,6 +4,7 @@ module CLI ( , getCommand ) where +import Data.Text.Lazy as Text (Text, split) import Data.Version (showVersion) import Control.Applicative ((<*>), optional) import Options.Applicative ( @@ -21,19 +22,29 @@ data Command = Command { charPredicate :: ReadM (Char -> Bool) charPredicate = flip elem <$> (str :: ReadM String) +csv :: ReadM [Text] +csv = split (== ',') <$> str + configOptions :: Parser Config configOptions = Config - <$> option (optional str) (short 'c' <> long "formColumn" <> value formColumn + <$> option (optional str) ( + short 'c' <> long "formColumn" <> value formColumn + <> metavar "COLUMN_NAME" <> help "the column to use as the form" ) + <*> option (optional csv) (short 'H' <> long "header" <> value headerOverride + <> metavar "COLUMN_NAMES" + <> help "comma-separated names to use for the columns (can be empty to unselect a column)" + ) <*> option charPredicate (short 'p' <> long "punctuation" <> value punctuation + <> metavar "PUNCTUATION_CHARACTERS" <> help "characters to encode as punctuation (defaults to Data.Char.isPunctuation)" ) <*> switch (short 's' <> long "strictTEI" <> help "only use TEI's att.linguistic on the elements" ) where - Config {formColumn, punctuation} = defaultConfig + Config {formColumn, headerOverride, punctuation} = defaultConfig command :: Parser Command command = Command diff --git a/lib/Text/TEIWA/Config.hs b/lib/Text/TEIWA/Config.hs index f863274178831f0b837ce9a23c60a23df15aca9c..457c34c74d1024c2e53de7350be3c0891e6d65ec 100644 --- a/lib/Text/TEIWA/Config.hs +++ b/lib/Text/TEIWA/Config.hs @@ -8,6 +8,7 @@ import Data.Text.Lazy (Text) data Config = Config { formColumn :: Maybe Text + , headerOverride :: Maybe [Text] , punctuation :: Char -> Bool , strictTEI :: Bool } @@ -15,6 +16,7 @@ data Config = Config { defaultConfig :: Config defaultConfig = Config { formColumn = Nothing + , headerOverride = Nothing , punctuation = isPunctuation , strictTEI = False } diff --git a/lib/Text/TEIWA/Error.hs b/lib/Text/TEIWA/Error.hs index e5b4b5968ad5f1aba928f66a6a0fce4f520ba0c1..2cb59403148ae3a1dcd177ed6a79b41fd46fcd81 100644 --- a/lib/Text/TEIWA/Error.hs +++ b/lib/Text/TEIWA/Error.hs @@ -7,14 +7,18 @@ import Text.Printf (printf) data Error = NoSuchColumn String + | NoFormColumn | EmptyHeader | MissingColumn Line String | ParsingError ParseError | TermNotFound String instance Show Error where - show (NoSuchColumn s) = printf "\"%s\" isn't a valid column name in this file" s - show EmptyHeader = printf "The CSV file header is empty" + show (NoSuchColumn s) = + printf "\"%s\" isn't a valid column name in this file" s + show NoFormColumn = + "No \"form\" column has been found, select one with --formColumn" + show EmptyHeader = "The CSV file header is empty" show (MissingColumn l s) = printf "Line %d is missing a value for column %s" l s show (ParsingError e) = show e show (TermNotFound t) = printf "Annotated term \"%s\" wasn't found in the input" t diff --git a/lib/Text/TEIWA/Source.hs b/lib/Text/TEIWA/Source.hs index 48b6fb92f5bbad8131103a9ef6da52d60cae1a1b..53a74038ad902a590fa6a146039e512bed1ed504 100644 --- a/lib/Text/TEIWA/Source.hs +++ b/lib/Text/TEIWA/Source.hs @@ -4,7 +4,6 @@ module Text.TEIWA.Source ( Format , Origin(..) - , TEIWAParser , Source(..) , coNLLX , csv @@ -18,28 +17,30 @@ import Control.Monad.Reader (MonadReader(..), ReaderT(..)) import Control.Monad.IO.Class (MonadIO(..)) import Data.Text.Lazy as Text (Text, toLower, unpack) import Data.Text.Lazy.IO as Text (readFile) -import Text.Parsec (ParsecT, SourceName, runParserT) +import Text.Parsec (SourceName, runParserT) import Text.TEIWA.Annotation ( Annotation(..), SentenceAnnotation(..), TokenAnnotation(..) ) import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Error (Error(..)) -import Text.TEIWA.Source.Common (AnnotationContext(..), Field, Row) -import qualified Text.TEIWA.Source.ConLLX as ConLLX (getContext, sentences) -import qualified Text.TEIWA.Source.SSV as SSV (body, getContext) +import Text.TEIWA.Source.Common ( + AnnotationContext(..), Field, Row, TEIWAParser, annotationContext + ) +import qualified Text.TEIWA.Source.ConLLX as ConLLX (fields, sentences) +import qualified Text.TEIWA.Source.SSV as SSV (body, fields) -type TEIWAParser = ParsecT Text () (ReaderT Config (Either Error)) type Format = TEIWAParser Annotation isTEI :: Field -> Bool isTEI = (`elem` ["pos", "lemma", "msd"]) filterAttributes :: MonadReader Config m => [(Field, Field)] -> m [(Field, Field)] -filterAttributes attributes = reader $ filterIf . strictTEI +filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI where - filterIf False = attributes - filterIf True = - [(k', v) | (k, v) <- attributes, let k' = Text.toLower k, isTEI k'] + filterIf _ ("", _) = [] + filterIf False (k, v) = [(k, v)] + filterIf True (k, v) = + let k' = Text.toLower k in if isTEI k' then [(k', v)] else [] annotateToken :: (MonadError Error m, MonadReader Config m) => AnnotationContext -> Row -> m TokenAnnotation @@ -51,7 +52,7 @@ annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, rec coNLLX :: Format coNLLX = do - context <- ConLLX.getContext =<< reader formColumn + context <- annotationContext ConLLX.fields SentenceLevel <$> ( ConLLX.sentences >>= mapM ( fmap SentenceAnnotation . mapM (annotateToken context) @@ -60,7 +61,7 @@ coNLLX = do ssv :: Char -> Format ssv separator = do - context <- SSV.getContext separator =<< reader formColumn + context <- annotationContext =<< SSV.fields separator TokenLevel <$> (SSV.body separator >>= mapM (annotateToken context)) csv :: Format diff --git a/lib/Text/TEIWA/Source/Common.hs b/lib/Text/TEIWA/Source/Common.hs index efb568893dbb14913b1d293c0585d16c4fcceefb..83839e9a5463ef6fc5372ca1e64221655c196605 100644 --- a/lib/Text/TEIWA/Source/Common.hs +++ b/lib/Text/TEIWA/Source/Common.hs @@ -1,18 +1,27 @@ {-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE OverloadedStrings #-} module Text.TEIWA.Source.Common ( AnnotationContext(..) , Field , Row + , TEIWAParser + , annotationContext , eol , recordLine ) where import Control.Applicative ((<|>)) -import Data.Text.Lazy as Text (Text) +import Control.Monad.Except (MonadError(..)) +import Control.Monad.Reader (MonadReader(..), ReaderT) +import Data.List (findIndex) +import Data.Text.Lazy as Text (Text, toLower, unpack) import Text.Parsec ( Column, Line, ParsecT, Stream, char, endOfLine, getParserState, sourceLine, statePos , try ) +import Text.TEIWA.Config (Config(..)) +import Text.TEIWA.Error (Error(..)) data AnnotationContext = AnnotationContext { columnIndex :: Column @@ -24,6 +33,8 @@ type Field = Text type Header = [Text] type Row = (Line, [Field]) +type TEIWAParser = ParsecT Text () (ReaderT Config (Either Error)) + eol :: Stream s m Char => ParsecT s u m () eol = (try endOfLine <|> char '\r') *> return () @@ -31,3 +42,25 @@ recordLine :: Monad m => ParsecT s u m a -> ParsecT s u m (Line, a) recordLine p = (,) <$> currentLine <*> p where currentLine = sourceLine . statePos <$> getParserState + +annotationContext :: Header -> TEIWAParser AnnotationContext +annotationContext defaultFields = do + Config {formColumn, headerOverride} <- ask + --headerFields <- maybe defaultFields pure headerOverride + let headerFields = maybe defaultFields id headerOverride + column <- catchNothing ((Right <$> formColumn) <|> findForm headerFields) + buildContext column headerFields + where + findForm = fmap Left . findIndex ((==) "form" . Text.toLower) + catchNothing = maybe (throwError NoFormColumn) return + +buildContext :: MonadError Error m => + Either Int Text -> Header -> m AnnotationContext +buildContext column = build . either splitAt (break . (==)) column + where + build (_, []) = + throwError . NoSuchColumn $ either (\_ -> "form") Text.unpack column + build (before, columnName:after) = + let columnIndex = length before in + let header = (before ++ after) in + return $ AnnotationContext {columnIndex, columnName, header} diff --git a/lib/Text/TEIWA/Source/ConLLX.hs b/lib/Text/TEIWA/Source/ConLLX.hs index 2a04a0e7fbf61aba2c8012a2f02903476534b6c0..fb0f1d470e57878c86a1c7a69dc78b5fd0304369 100644 --- a/lib/Text/TEIWA/Source/ConLLX.hs +++ b/lib/Text/TEIWA/Source/ConLLX.hs @@ -1,19 +1,14 @@ -{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE FlexibleContexts #-} module Text.TEIWA.Source.ConLLX ( - getContext + fields , sentences ) where import Control.Applicative (many) -import Control.Monad.Except (MonadError(..)) -import Data.Text.Lazy as Text (Text, pack, unpack) +import Data.Text.Lazy as Text (pack) import Text.Parsec (ParsecT, Stream, char, many1, noneOf, sepBy1) -import Text.TEIWA.Error (Error(..)) -import Text.TEIWA.Source.Common ( - AnnotationContext(..), Field, Row, eol, recordLine - ) +import Text.TEIWA.Source.Common (Field, Row, eol, recordLine) fields :: [Field] fields = [ @@ -44,17 +39,3 @@ sentence = many comment *> many1 row sentences :: Stream s m Char => ParsecT s u m [Sentence] sentences = many (sentence <* many1 eol) - -getContext :: MonadError Error m => Maybe Text -> m AnnotationContext -getContext Nothing = return $ AnnotationContext { - columnIndex = 1 - , columnName = "FORM" - , header = take 1 fields ++ drop 2 fields - } -getContext (Just columnName) = - case break (== columnName) fields of - (_, []) -> throwError . NoSuchColumn $ Text.unpack columnName - (before, _:after) -> - let columnIndex = length before in - let header = (before ++ after) in - return $ AnnotationContext {columnIndex, columnName, header} diff --git a/lib/Text/TEIWA/Source/SSV.hs b/lib/Text/TEIWA/Source/SSV.hs index 12187018f0fa04625e96d12a9998345949026604..43c9e3cbf61f16aedadba5b06df64f63df1b0b10 100644 --- a/lib/Text/TEIWA/Source/SSV.hs +++ b/lib/Text/TEIWA/Source/SSV.hs @@ -1,18 +1,13 @@ -{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE FlexibleContexts #-} module Text.TEIWA.Source.SSV ( body - , getContext + , fields ) where import Control.Applicative ((<|>), many) -import Control.Monad.Except (MonadError(..)) -import Data.Text.Lazy as Text (Text, pack, unpack) +import Data.Text.Lazy as Text (pack) import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try) -import Text.TEIWA.Error (Error(..)) -import Text.TEIWA.Source.Common ( - AnnotationContext(..), Field, Row, eol, recordLine - ) +import Text.TEIWA.Source.Common (Field, Row, eol, recordLine) field :: Stream s m Char => Char -> ParsecT s u m Field field separator = Text.pack <$> (regular <|> quoted) @@ -27,18 +22,3 @@ fields separator = (field separator `sepBy` char separator) <* eol body :: Stream s m Char => Char -> ParsecT s u m [Row] body = many . recordLine . fields - -getContext :: (Stream s m Char, MonadError Error m) => - Char -> Maybe Text -> ParsecT s u m AnnotationContext -getContext separator formColumn = fields separator >>= aux formColumn - where - aux Nothing [] = throwError EmptyHeader - aux Nothing (columnName:header) = - return $ AnnotationContext {columnIndex = 0, columnName, header} - aux (Just columnName) headerRecord = - case break (== columnName) headerRecord of - (_, []) -> throwError . NoSuchColumn $ Text.unpack columnName - (before, _:after) -> - let columnIndex = length before in - let header = (before ++ after) in - return $ AnnotationContext {columnIndex, columnName, header}