From f22f1393b60c4611d4bb658e54d8939b5a53dfc1 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Fri, 19 Mar 2021 20:12:32 +0100
Subject: [PATCH] Add support for --strictTEI option to enforce the use of
 TEI's LLA attributes only

---
 app/CLI.hs                                    | 15 ++++-
 lib/Text/TEIWA/Config.hs                      |  2 +
 lib/Text/TEIWA/Error.hs                       |  8 ++-
 lib/Text/TEIWA/Source.hs                      | 62 ++++++++++++-------
 lib/Text/TEIWA/Source/CSV.hs                  | 44 -------------
 .../TEIWA/Source/{ConLLX.hs => CoNLLX.hs}     | 27 ++------
 lib/Text/TEIWA/Source/Common.hs               | 35 ++++++++++-
 lib/Text/TEIWA/Source/SSV.hs                  | 24 +++++++
 teiwa.cabal                                   |  4 +-
 9 files changed, 123 insertions(+), 98 deletions(-)
 delete mode 100644 lib/Text/TEIWA/Source/CSV.hs
 rename lib/Text/TEIWA/Source/{ConLLX.hs => CoNLLX.hs} (50%)
 create mode 100644 lib/Text/TEIWA/Source/SSV.hs

diff --git a/app/CLI.hs b/app/CLI.hs
index 8d35258..7117008 100644
--- a/app/CLI.hs
+++ b/app/CLI.hs
@@ -4,6 +4,7 @@ module CLI (
     , getCommand
   ) where
 
+import Data.Text.Lazy as Text (Text, split)
 import Data.Version (showVersion)
 import Control.Applicative ((<*>), optional)
 import Options.Applicative (
@@ -21,19 +22,29 @@ data Command = Command {
 charPredicate :: ReadM (Char -> Bool)
 charPredicate = flip elem <$> (str :: ReadM String)
 
+csv :: ReadM [Text]
+csv = split (== ',') <$> str
+
 configOptions :: Parser Config
 configOptions = Config
-  <$> option (optional str) (short 'c' <> long "formColumn" <> value formColumn
+  <$> option (optional str) (
+            short 'c' <> long "formColumn" <> value formColumn
+          <> metavar "COLUMN_NAME"
           <> help "the column to use as the form"
         )
+  <*> option (optional csv) (short 'H' <> long "header" <> value headerOverride
+          <> metavar "COLUMN_NAMES"
+          <> help "comma-separated names to use for the columns (can be empty to unselect a column)"
+        )
   <*> option charPredicate (short 'p' <> long "punctuation" <> value punctuation
+          <> metavar "PUNCTUATION_CHARACTERS"
           <> help "characters to encode as punctuation (defaults to Data.Char.isPunctuation)"
         )
   <*> switch (short 's' <> long "strictTEI"
           <> help "only use TEI's att.linguistic on the elements"
         )
   where
-    Config {formColumn, punctuation} = defaultConfig
+    Config {formColumn, headerOverride, punctuation} = defaultConfig
 
 command :: Parser Command
 command = Command
diff --git a/lib/Text/TEIWA/Config.hs b/lib/Text/TEIWA/Config.hs
index f863274..457c34c 100644
--- a/lib/Text/TEIWA/Config.hs
+++ b/lib/Text/TEIWA/Config.hs
@@ -8,6 +8,7 @@ import Data.Text.Lazy (Text)
 
 data Config = Config {
       formColumn :: Maybe Text
+    , headerOverride :: Maybe [Text]
     , punctuation :: Char -> Bool
     , strictTEI :: Bool
   }
@@ -15,6 +16,7 @@ data Config = Config {
 defaultConfig :: Config
 defaultConfig = Config {
       formColumn = Nothing
+    , headerOverride = Nothing
     , punctuation = isPunctuation
     , strictTEI = False
   }
diff --git a/lib/Text/TEIWA/Error.hs b/lib/Text/TEIWA/Error.hs
index e5b4b59..2cb5940 100644
--- a/lib/Text/TEIWA/Error.hs
+++ b/lib/Text/TEIWA/Error.hs
@@ -7,14 +7,18 @@ import Text.Printf (printf)
 
 data Error =
     NoSuchColumn String
+  | NoFormColumn
   | EmptyHeader
   | MissingColumn Line String
   | ParsingError ParseError
   | TermNotFound String
 
 instance Show Error where
-  show (NoSuchColumn s) = printf "\"%s\" isn't a valid column name in this file" s
-  show EmptyHeader = printf "The CSV file header is empty"
+  show (NoSuchColumn s) =
+    printf "\"%s\" isn't a valid column name in this file" s
+  show NoFormColumn =
+    "No \"form\" column has been found, select one with --formColumn"
+  show EmptyHeader = "The CSV file header is empty"
   show (MissingColumn l s) = printf "Line %d is missing a value for column %s" l s
   show (ParsingError e) = show e
   show (TermNotFound t) = printf "Annotated term \"%s\" wasn't found in the input" t
diff --git a/lib/Text/TEIWA/Source.hs b/lib/Text/TEIWA/Source.hs
index 1a1db4d..6c906f7 100644
--- a/lib/Text/TEIWA/Source.hs
+++ b/lib/Text/TEIWA/Source.hs
@@ -1,9 +1,9 @@
 {-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE OverloadedStrings #-}
 module Text.TEIWA.Source (
       Format
     , Origin(..)
-    , TEIWAParser
     , Source(..)
     , coNLLX
     , csv
@@ -13,48 +13,62 @@ module Text.TEIWA.Source (
   ) where
 
 import Control.Monad.Except (MonadError(..))
+import Control.Monad.Reader (MonadReader(..), ReaderT(..))
 import Control.Monad.IO.Class (MonadIO(..))
-import Data.Text.Lazy as Text (Text, unpack)
+import Data.Text.Lazy as Text (Text, toLower, unpack)
 import Data.Text.Lazy.IO as Text (readFile)
-import Text.Parsec (ParsecT, SourceName, runParserT)
+import Text.Parsec (SourceName, runParserT)
 import Text.TEIWA.Annotation (
     Annotation(..), SentenceAnnotation(..), TokenAnnotation(..)
   )
 import Text.TEIWA.Config (Config(..))
 import Text.TEIWA.Error (Error(..))
-import Text.TEIWA.Source.Common (AnnotationContext(..), Row)
-import qualified Text.TEIWA.Source.ConLLX as ConLLX (getContext, sentences)
-import qualified Text.TEIWA.Source.CSV as CSV (body, getContext)
+import Text.TEIWA.Source.Common (
+    AnnotationContext(..), Field, Row, TEIWAParser, annotationContext
+  )
+import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (fields, sentences)
+import qualified Text.TEIWA.Source.SSV as SSV (body, fields)
+
+type Format = TEIWAParser Annotation
 
-type TEIWAParser = ParsecT Text () (Either Error)
-type Format = Config -> TEIWAParser Annotation
+isTEI :: Field -> Bool
+isTEI = (`elem` ["pos", "lemma", "msd"])
 
-annotateToken :: MonadError Error m =>
+filterAttributes :: MonadReader Config m => [(Field, Field)] -> m [(Field, Field)]
+filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI
+  where
+    filterIf _ ("", _) = []
+    filterIf False (k, v) = [(k, v)]
+    filterIf True (k, v) =
+      let k' = Text.toLower k in if isTEI k' then [(k', v)] else []
+
+annotateToken :: (MonadError Error m, MonadReader Config m) =>
   AnnotationContext -> Row -> m TokenAnnotation
 annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, record) =
   case splitAt columnIndex record of
     (_, []) -> throwError . MissingColumn atLine $ Text.unpack columnName
     (before, form:after) ->
-      return $ TokenAnnotation {form, annotated = zip header (before ++ after)}
+      TokenAnnotation form <$> filterAttributes (zip header $ before ++ after)
 
 coNLLX :: Format
-coNLLX (Config {formColumn}) = do
-  context <- ConLLX.getContext formColumn
+coNLLX = do
+  context <- annotationContext CoNLLX.fields
   SentenceLevel <$> (
-      ConLLX.sentences >>= mapM (
+      CoNLLX.sentences >>= mapM (
           fmap SentenceAnnotation . mapM (annotateToken context)
         )
     )
 
+ssv :: Char -> Format
+ssv separator = do
+  context <- annotationContext =<< SSV.fields separator
+  TokenLevel <$> (SSV.body separator >>= mapM (annotateToken context))
+
 csv :: Format
-csv (Config {formColumn}) = do
-  context <- CSV.getContext ',' formColumn
-  TokenLevel <$> (CSV.body ',' >>= mapM (annotateToken context))
+csv = ssv ','
 
 tsv :: Format
-tsv (Config {formColumn}) = do
-  context <- CSV.getContext '\t' formColumn
-  TokenLevel <$> (CSV.body '\t' >>= mapM (annotateToken context))
+tsv = ssv '\t'
 
 data Origin = File FilePath | Text Text
 
@@ -64,14 +78,14 @@ data Source = Source {
   }
 
 runTEIWAParser :: MonadError Error m =>
-  TEIWAParser a -> SourceName -> Text -> m a
-runTEIWAParser p s = flattenErrors . runParserT p () s
+  Config -> TEIWAParser a -> SourceName -> Text -> m a
+runTEIWAParser config p s = flattenErrors . (`runReaderT` config) . runParserT p () s
   where
     flattenErrors = either throwError (either (throwError . ParsingError) pure)
 
 parse :: (MonadIO m, MonadError Error m) =>
   Config -> Source -> m Annotation
-parse config (Source {format, origin}) = parseFrom (format config) origin
+parse config (Source {format, origin}) = parseFrom format origin
   where
-    parseFrom p (File f) = liftIO (Text.readFile f) >>= runTEIWAParser p f
-    parseFrom p (Text t) = runTEIWAParser p "" t
+    parseFrom p (File f) = liftIO (Text.readFile f) >>= runTEIWAParser config p f
+    parseFrom p (Text t) = runTEIWAParser config p "" t
diff --git a/lib/Text/TEIWA/Source/CSV.hs b/lib/Text/TEIWA/Source/CSV.hs
deleted file mode 100644
index eed165c..0000000
--- a/lib/Text/TEIWA/Source/CSV.hs
+++ /dev/null
@@ -1,44 +0,0 @@
-{-# LANGUAGE NamedFieldPuns #-}
-{-# LANGUAGE FlexibleContexts #-}
-module Text.TEIWA.Source.CSV (
-      body
-    , getContext
-  ) where
-
-import Control.Applicative ((<|>), many)
-import Control.Monad.Except (MonadError(..))
-import Data.Text.Lazy as Text (Text, pack, unpack)
-import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try)
-import Text.TEIWA.Error (Error(..))
-import Text.TEIWA.Source.Common (
-    AnnotationContext(..), Field, Row, eol, recordLine
-  )
-
-field :: Stream s m Char => Char -> ParsecT s u m Field
-field separator = Text.pack <$> (regular <|> quoted)
-  where
-    regular = many (noneOf $ separator:"\n\r\"")
-    quoted = between quote quote $
-      many (noneOf "\"" <|> try (string "\"\"" *> pure '"'))
-    quote = char '"'
-
-fields :: Stream s m Char => Char -> ParsecT s u m [Field]
-fields separator = (field separator `sepBy` char separator) <* eol
-
-body :: Stream s m Char => Char -> ParsecT s u m [Row]
-body = many . recordLine . fields
-
-getContext :: (Stream s m Char, MonadError Error m) =>
-  Char -> Maybe Text -> ParsecT s u m AnnotationContext
-getContext separator formColumn = fields separator >>= aux formColumn
-  where
-    aux Nothing [] = throwError EmptyHeader
-    aux Nothing (columnName:header) =
-      return $ AnnotationContext {columnIndex = 0, columnName, header}
-    aux (Just columnName) headerRecord =
-      case break (== columnName) headerRecord of
-        (_, []) -> throwError . NoSuchColumn $ Text.unpack columnName
-        (before, _:after) ->
-          let columnIndex = length before in
-          let header = (before ++ after) in
-          return $ AnnotationContext {columnIndex, columnName, header}
diff --git a/lib/Text/TEIWA/Source/ConLLX.hs b/lib/Text/TEIWA/Source/CoNLLX.hs
similarity index 50%
rename from lib/Text/TEIWA/Source/ConLLX.hs
rename to lib/Text/TEIWA/Source/CoNLLX.hs
index 2a04a0e..5f2d517 100644
--- a/lib/Text/TEIWA/Source/ConLLX.hs
+++ b/lib/Text/TEIWA/Source/CoNLLX.hs
@@ -1,19 +1,14 @@
-{-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE FlexibleContexts #-}
-module Text.TEIWA.Source.ConLLX (
-      getContext
+module Text.TEIWA.Source.CoNLLX (
+      fields
     , sentences
   ) where
 
 import Control.Applicative (many)
-import Control.Monad.Except (MonadError(..))
-import Data.Text.Lazy as Text (Text, pack, unpack)
+import Data.Text.Lazy as Text (pack)
 import Text.Parsec (ParsecT, Stream, char, many1, noneOf, sepBy1)
-import Text.TEIWA.Error (Error(..))
-import Text.TEIWA.Source.Common (
-    AnnotationContext(..), Field, Row, eol, recordLine
-  )
+import Text.TEIWA.Source.Common (Field, Row, eol, recordLine)
 
 fields :: [Field]
 fields = [
@@ -44,17 +39,3 @@ sentence = many comment *> many1 row
 
 sentences :: Stream s m Char => ParsecT s u m [Sentence]
 sentences = many (sentence <* many1 eol)
-
-getContext :: MonadError Error m => Maybe Text -> m AnnotationContext
-getContext Nothing = return $ AnnotationContext {
-      columnIndex = 1
-    , columnName = "FORM"
-    , header = take 1 fields ++ drop 2 fields
-  }
-getContext (Just columnName) =
-  case break (== columnName) fields of
-    (_, []) -> throwError . NoSuchColumn $ Text.unpack columnName
-    (before, _:after) ->
-      let columnIndex = length before in
-      let header = (before ++ after) in
-      return $ AnnotationContext {columnIndex, columnName, header}
diff --git a/lib/Text/TEIWA/Source/Common.hs b/lib/Text/TEIWA/Source/Common.hs
index efb5688..83839e9 100644
--- a/lib/Text/TEIWA/Source/Common.hs
+++ b/lib/Text/TEIWA/Source/Common.hs
@@ -1,18 +1,27 @@
 {-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE NamedFieldPuns #-}
+{-# LANGUAGE OverloadedStrings #-}
 module Text.TEIWA.Source.Common (
       AnnotationContext(..)
     , Field
     , Row
+    , TEIWAParser
+    , annotationContext
     , eol
     , recordLine
   ) where
 
 import Control.Applicative ((<|>))
-import Data.Text.Lazy as Text (Text)
+import Control.Monad.Except (MonadError(..))
+import Control.Monad.Reader (MonadReader(..), ReaderT)
+import Data.List (findIndex)
+import Data.Text.Lazy as Text (Text, toLower, unpack)
 import Text.Parsec (
       Column, Line, ParsecT, Stream, char, endOfLine, getParserState, sourceLine, statePos
     , try
   )
+import Text.TEIWA.Config (Config(..))
+import Text.TEIWA.Error (Error(..))
 
 data AnnotationContext = AnnotationContext {
       columnIndex :: Column
@@ -24,6 +33,8 @@ type Field = Text
 type Header = [Text]
 type Row = (Line, [Field])
 
+type TEIWAParser = ParsecT Text () (ReaderT Config (Either Error))
+
 eol :: Stream s m Char => ParsecT s u m ()
 eol = (try endOfLine <|> char '\r') *> return ()
 
@@ -31,3 +42,25 @@ recordLine :: Monad m => ParsecT s u m a -> ParsecT s u m (Line, a)
 recordLine p = (,) <$> currentLine <*> p
   where
     currentLine = sourceLine . statePos <$> getParserState
+
+annotationContext :: Header -> TEIWAParser AnnotationContext
+annotationContext defaultFields = do
+  Config {formColumn, headerOverride} <- ask
+  --headerFields <- maybe defaultFields pure headerOverride
+  let headerFields = maybe defaultFields id headerOverride
+  column <- catchNothing ((Right <$> formColumn) <|> findForm headerFields)
+  buildContext column headerFields
+  where
+    findForm = fmap Left . findIndex ((==) "form" . Text.toLower)
+    catchNothing = maybe (throwError NoFormColumn) return
+
+buildContext :: MonadError Error m =>
+  Either Int Text -> Header -> m AnnotationContext
+buildContext column = build . either splitAt (break . (==)) column
+  where
+    build (_, []) =
+      throwError . NoSuchColumn $ either (\_ -> "form") Text.unpack column
+    build (before, columnName:after) =
+      let columnIndex = length before in
+      let header = (before ++ after) in
+      return $ AnnotationContext {columnIndex, columnName, header}
diff --git a/lib/Text/TEIWA/Source/SSV.hs b/lib/Text/TEIWA/Source/SSV.hs
new file mode 100644
index 0000000..43c9e3c
--- /dev/null
+++ b/lib/Text/TEIWA/Source/SSV.hs
@@ -0,0 +1,24 @@
+{-# LANGUAGE FlexibleContexts #-}
+module Text.TEIWA.Source.SSV (
+      body
+    , fields
+  ) where
+
+import Control.Applicative ((<|>), many)
+import Data.Text.Lazy as Text (pack)
+import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try)
+import Text.TEIWA.Source.Common (Field, Row, eol, recordLine)
+
+field :: Stream s m Char => Char -> ParsecT s u m Field
+field separator = Text.pack <$> (regular <|> quoted)
+  where
+    regular = many (noneOf $ separator:"\n\r\"")
+    quoted = between quote quote $
+      many (noneOf "\"" <|> try (string "\"\"" *> pure '"'))
+    quote = char '"'
+
+fields :: Stream s m Char => Char -> ParsecT s u m [Field]
+fields separator = (field separator `sepBy` char separator) <* eol
+
+body :: Stream s m Char => Char -> ParsecT s u m [Row]
+body = many . recordLine . fields
diff --git a/teiwa.cabal b/teiwa.cabal
index 01eb1bc..8e84544 100644
--- a/teiwa.cabal
+++ b/teiwa.cabal
@@ -29,8 +29,8 @@ library
                      , Text.TEIWA.Error
                      , Text.TEIWA.Source
                      , Text.TEIWA.Source.Common
-                     , Text.TEIWA.Source.ConLLX
-                     , Text.TEIWA.Source.CSV
+                     , Text.TEIWA.Source.CoNLLX
+                     , Text.TEIWA.Source.SSV
   build-depends:       base >=4.12 && <4.15
                      , bytestring
                      , mtl
-- 
GitLab