From 09f7f0576290aab3ceae2e374de33908e170dd93 Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Thu, 18 Mar 2021 10:00:26 +0100 Subject: [PATCH] Implement strictTEI filtering --- lib/Text/TEIWA/Source.hs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/lib/Text/TEIWA/Source.hs b/lib/Text/TEIWA/Source.hs index b4664db..48b6fb9 100644 --- a/lib/Text/TEIWA/Source.hs +++ b/lib/Text/TEIWA/Source.hs @@ -1,5 +1,6 @@ {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} module Text.TEIWA.Source ( Format , Origin(..) @@ -15,7 +16,7 @@ module Text.TEIWA.Source ( import Control.Monad.Except (MonadError(..)) import Control.Monad.Reader (MonadReader(..), ReaderT(..)) import Control.Monad.IO.Class (MonadIO(..)) -import Data.Text.Lazy as Text (Text, unpack) +import Data.Text.Lazy as Text (Text, toLower, unpack) import Data.Text.Lazy.IO as Text (readFile) import Text.Parsec (ParsecT, SourceName, runParserT) import Text.TEIWA.Annotation ( @@ -23,20 +24,30 @@ import Text.TEIWA.Annotation ( ) import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Error (Error(..)) -import Text.TEIWA.Source.Common (AnnotationContext(..), Row) +import Text.TEIWA.Source.Common (AnnotationContext(..), Field, Row) import qualified Text.TEIWA.Source.ConLLX as ConLLX (getContext, sentences) import qualified Text.TEIWA.Source.SSV as SSV (body, getContext) type TEIWAParser = ParsecT Text () (ReaderT Config (Either Error)) type Format = TEIWAParser Annotation +isTEI :: Field -> Bool +isTEI = (`elem` ["pos", "lemma", "msd"]) + +filterAttributes :: MonadReader Config m => [(Field, Field)] -> m [(Field, Field)] +filterAttributes attributes = reader $ filterIf . strictTEI + where + filterIf False = attributes + filterIf True = + [(k', v) | (k, v) <- attributes, let k' = Text.toLower k, isTEI k'] + annotateToken :: (MonadError Error m, MonadReader Config m) => AnnotationContext -> Row -> m TokenAnnotation annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, record) = case splitAt columnIndex record of (_, []) -> throwError . MissingColumn atLine $ Text.unpack columnName (before, form:after) -> - return $ TokenAnnotation {form, annotated = zip header (before ++ after)} + TokenAnnotation form <$> filterAttributes (zip header $ before ++ after) coNLLX :: Format coNLLX = do -- GitLab