Skip to content
Snippets Groups Projects
Commit a0fd2374 authored by Alice Brenon's avatar Alice Brenon
Browse files

Implement custom headers to select columns in CoNLL-X

parent 09f7f057
No related branches found
No related tags found
No related merge requests found
......@@ -4,6 +4,7 @@ module CLI (
, getCommand
) where
import Data.Text.Lazy as Text (Text, split)
import Data.Version (showVersion)
import Control.Applicative ((<*>), optional)
import Options.Applicative (
......@@ -21,19 +22,29 @@ data Command = Command {
charPredicate :: ReadM (Char -> Bool)
charPredicate = flip elem <$> (str :: ReadM String)
csv :: ReadM [Text]
csv = split (== ',') <$> str
configOptions :: Parser Config
configOptions = Config
<$> option (optional str) (short 'c' <> long "formColumn" <> value formColumn
<$> option (optional str) (
short 'c' <> long "formColumn" <> value formColumn
<> metavar "COLUMN_NAME"
<> help "the column to use as the form"
)
<*> option (optional csv) (short 'H' <> long "header" <> value headerOverride
<> metavar "COLUMN_NAMES"
<> help "comma-separated names to use for the columns (can be empty to unselect a column)"
)
<*> option charPredicate (short 'p' <> long "punctuation" <> value punctuation
<> metavar "PUNCTUATION_CHARACTERS"
<> help "characters to encode as punctuation (defaults to Data.Char.isPunctuation)"
)
<*> switch (short 's' <> long "strictTEI"
<> help "only use TEI's att.linguistic on the elements"
)
where
Config {formColumn, punctuation} = defaultConfig
Config {formColumn, headerOverride, punctuation} = defaultConfig
command :: Parser Command
command = Command
......
......@@ -8,6 +8,7 @@ import Data.Text.Lazy (Text)
data Config = Config {
formColumn :: Maybe Text
, headerOverride :: Maybe [Text]
, punctuation :: Char -> Bool
, strictTEI :: Bool
}
......@@ -15,6 +16,7 @@ data Config = Config {
defaultConfig :: Config
defaultConfig = Config {
formColumn = Nothing
, headerOverride = Nothing
, punctuation = isPunctuation
, strictTEI = False
}
......@@ -7,14 +7,18 @@ import Text.Printf (printf)
data Error =
NoSuchColumn String
| NoFormColumn
| EmptyHeader
| MissingColumn Line String
| ParsingError ParseError
| TermNotFound String
instance Show Error where
show (NoSuchColumn s) = printf "\"%s\" isn't a valid column name in this file" s
show EmptyHeader = printf "The CSV file header is empty"
show (NoSuchColumn s) =
printf "\"%s\" isn't a valid column name in this file" s
show NoFormColumn =
"No \"form\" column has been found, select one with --formColumn"
show EmptyHeader = "The CSV file header is empty"
show (MissingColumn l s) = printf "Line %d is missing a value for column %s" l s
show (ParsingError e) = show e
show (TermNotFound t) = printf "Annotated term \"%s\" wasn't found in the input" t
......@@ -4,7 +4,6 @@
module Text.TEIWA.Source (
Format
, Origin(..)
, TEIWAParser
, Source(..)
, coNLLX
, csv
......@@ -18,28 +17,30 @@ import Control.Monad.Reader (MonadReader(..), ReaderT(..))
import Control.Monad.IO.Class (MonadIO(..))
import Data.Text.Lazy as Text (Text, toLower, unpack)
import Data.Text.Lazy.IO as Text (readFile)
import Text.Parsec (ParsecT, SourceName, runParserT)
import Text.Parsec (SourceName, runParserT)
import Text.TEIWA.Annotation (
Annotation(..), SentenceAnnotation(..), TokenAnnotation(..)
)
import Text.TEIWA.Config (Config(..))
import Text.TEIWA.Error (Error(..))
import Text.TEIWA.Source.Common (AnnotationContext(..), Field, Row)
import qualified Text.TEIWA.Source.ConLLX as ConLLX (getContext, sentences)
import qualified Text.TEIWA.Source.SSV as SSV (body, getContext)
import Text.TEIWA.Source.Common (
AnnotationContext(..), Field, Row, TEIWAParser, annotationContext
)
import qualified Text.TEIWA.Source.ConLLX as ConLLX (fields, sentences)
import qualified Text.TEIWA.Source.SSV as SSV (body, fields)
type TEIWAParser = ParsecT Text () (ReaderT Config (Either Error))
type Format = TEIWAParser Annotation
isTEI :: Field -> Bool
isTEI = (`elem` ["pos", "lemma", "msd"])
filterAttributes :: MonadReader Config m => [(Field, Field)] -> m [(Field, Field)]
filterAttributes attributes = reader $ filterIf . strictTEI
filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI
where
filterIf False = attributes
filterIf True =
[(k', v) | (k, v) <- attributes, let k' = Text.toLower k, isTEI k']
filterIf _ ("", _) = []
filterIf False (k, v) = [(k, v)]
filterIf True (k, v) =
let k' = Text.toLower k in if isTEI k' then [(k', v)] else []
annotateToken :: (MonadError Error m, MonadReader Config m) =>
AnnotationContext -> Row -> m TokenAnnotation
......@@ -51,7 +52,7 @@ annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, rec
coNLLX :: Format
coNLLX = do
context <- ConLLX.getContext =<< reader formColumn
context <- annotationContext ConLLX.fields
SentenceLevel <$> (
ConLLX.sentences >>= mapM (
fmap SentenceAnnotation . mapM (annotateToken context)
......@@ -60,7 +61,7 @@ coNLLX = do
ssv :: Char -> Format
ssv separator = do
context <- SSV.getContext separator =<< reader formColumn
context <- annotationContext =<< SSV.fields separator
TokenLevel <$> (SSV.body separator >>= mapM (annotateToken context))
csv :: Format
......
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
module Text.TEIWA.Source.Common (
AnnotationContext(..)
, Field
, Row
, TEIWAParser
, annotationContext
, eol
, recordLine
) where
import Control.Applicative ((<|>))
import Data.Text.Lazy as Text (Text)
import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (MonadReader(..), ReaderT)
import Data.List (findIndex)
import Data.Text.Lazy as Text (Text, toLower, unpack)
import Text.Parsec (
Column, Line, ParsecT, Stream, char, endOfLine, getParserState, sourceLine, statePos
, try
)
import Text.TEIWA.Config (Config(..))
import Text.TEIWA.Error (Error(..))
data AnnotationContext = AnnotationContext {
columnIndex :: Column
......@@ -24,6 +33,8 @@ type Field = Text
type Header = [Text]
type Row = (Line, [Field])
type TEIWAParser = ParsecT Text () (ReaderT Config (Either Error))
eol :: Stream s m Char => ParsecT s u m ()
eol = (try endOfLine <|> char '\r') *> return ()
......@@ -31,3 +42,25 @@ recordLine :: Monad m => ParsecT s u m a -> ParsecT s u m (Line, a)
recordLine p = (,) <$> currentLine <*> p
where
currentLine = sourceLine . statePos <$> getParserState
annotationContext :: Header -> TEIWAParser AnnotationContext
annotationContext defaultFields = do
Config {formColumn, headerOverride} <- ask
--headerFields <- maybe defaultFields pure headerOverride
let headerFields = maybe defaultFields id headerOverride
column <- catchNothing ((Right <$> formColumn) <|> findForm headerFields)
buildContext column headerFields
where
findForm = fmap Left . findIndex ((==) "form" . Text.toLower)
catchNothing = maybe (throwError NoFormColumn) return
buildContext :: MonadError Error m =>
Either Int Text -> Header -> m AnnotationContext
buildContext column = build . either splitAt (break . (==)) column
where
build (_, []) =
throwError . NoSuchColumn $ either (\_ -> "form") Text.unpack column
build (before, columnName:after) =
let columnIndex = length before in
let header = (before ++ after) in
return $ AnnotationContext {columnIndex, columnName, header}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Source.ConLLX (
getContext
fields
, sentences
) where
import Control.Applicative (many)
import Control.Monad.Except (MonadError(..))
import Data.Text.Lazy as Text (Text, pack, unpack)
import Data.Text.Lazy as Text (pack)
import Text.Parsec (ParsecT, Stream, char, many1, noneOf, sepBy1)
import Text.TEIWA.Error (Error(..))
import Text.TEIWA.Source.Common (
AnnotationContext(..), Field, Row, eol, recordLine
)
import Text.TEIWA.Source.Common (Field, Row, eol, recordLine)
fields :: [Field]
fields = [
......@@ -44,17 +39,3 @@ sentence = many comment *> many1 row
sentences :: Stream s m Char => ParsecT s u m [Sentence]
sentences = many (sentence <* many1 eol)
getContext :: MonadError Error m => Maybe Text -> m AnnotationContext
getContext Nothing = return $ AnnotationContext {
columnIndex = 1
, columnName = "FORM"
, header = take 1 fields ++ drop 2 fields
}
getContext (Just columnName) =
case break (== columnName) fields of
(_, []) -> throwError . NoSuchColumn $ Text.unpack columnName
(before, _:after) ->
let columnIndex = length before in
let header = (before ++ after) in
return $ AnnotationContext {columnIndex, columnName, header}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Source.SSV (
body
, getContext
, fields
) where
import Control.Applicative ((<|>), many)
import Control.Monad.Except (MonadError(..))
import Data.Text.Lazy as Text (Text, pack, unpack)
import Data.Text.Lazy as Text (pack)
import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try)
import Text.TEIWA.Error (Error(..))
import Text.TEIWA.Source.Common (
AnnotationContext(..), Field, Row, eol, recordLine
)
import Text.TEIWA.Source.Common (Field, Row, eol, recordLine)
field :: Stream s m Char => Char -> ParsecT s u m Field
field separator = Text.pack <$> (regular <|> quoted)
......@@ -27,18 +22,3 @@ fields separator = (field separator `sepBy` char separator) <* eol
body :: Stream s m Char => Char -> ParsecT s u m [Row]
body = many . recordLine . fields
getContext :: (Stream s m Char, MonadError Error m) =>
Char -> Maybe Text -> ParsecT s u m AnnotationContext
getContext separator formColumn = fields separator >>= aux formColumn
where
aux Nothing [] = throwError EmptyHeader
aux Nothing (columnName:header) =
return $ AnnotationContext {columnIndex = 0, columnName, header}
aux (Just columnName) headerRecord =
case break (== columnName) headerRecord of
(_, []) -> throwError . NoSuchColumn $ Text.unpack columnName
(before, _:after) ->
let columnIndex = length before in
let header = (before ++ after) in
return $ AnnotationContext {columnIndex, columnName, header}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment