From ca22f6fe3b2ee25b57f6730222636f289e6d89e2 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Thu, 19 Aug 2021 10:38:04 +0200
Subject: [PATCH] Draft a very simple WebAnno reader ignoring the fields
 described in header and taking a lot from TSV and CoNLL

---
Review notes (this section is not part of the commit message):

 * The line structure of this patch was rebuilt from a copy in which every
   line break and all indentation had been collapsed; the indentation of the
   context lines is therefore a best guess and must be checked against the
   tree before applying.
 * Compared to ca22f6fe: the format marker in WebAnno.header is wrapped in
   'try' so that "webAnno <|> ssv" can fall back to plain TSV, the parser for
   '#' lines is shared by header and sentence, and the new definitions are
   documented. The blob ids on the "index" lines are still those of the
   original commit.

 lib/Text/TEIWA.hs                |  5 +++
 lib/Text/TEIWA/Source.hs         | 16 +++++++-
 lib/Text/TEIWA/Source/WebAnno.hs | 67 ++++++++++++++++++++++++++++++++
 teiwa.cabal                      |  1 +
 4 files changed, 88 insertions(+), 1 deletion(-)
 create mode 100644 lib/Text/TEIWA/Source/WebAnno.hs

diff --git a/lib/Text/TEIWA.hs b/lib/Text/TEIWA.hs
index 3024941..d4ad823 100644
--- a/lib/Text/TEIWA.hs
+++ b/lib/Text/TEIWA.hs
@@ -8,6 +8,7 @@ module Text.TEIWA (
   , annotateWith
   , fromCSV
   , fromCoNLLX
+  , fromWebAnno
   ) where
 
 import Control.Monad.Except (MonadError(..))
@@ -34,3 +35,7 @@ fromCSV = annotate . Source csv
 
 fromCoNLLX :: (MonadError Error m, MonadIO m) => Origin -> Filter m
 fromCoNLLX = annotate . Source coNLLX
+
+-- | Same as 'fromCoNLLX', for a source in the WebAnno TSV format.
+fromWebAnno :: (MonadError Error m, MonadIO m) => Origin -> Filter m
+fromWebAnno = annotate . Source webAnno
diff --git a/lib/Text/TEIWA/Source.hs b/lib/Text/TEIWA/Source.hs
index 0858fcb..d6b9f66 100644
--- a/lib/Text/TEIWA/Source.hs
+++ b/lib/Text/TEIWA/Source.hs
@@ -10,8 +10,10 @@ module Text.TEIWA.Source (
   , runTEIWAParser
   , parse
   , tsv
+  , webAnno
   ) where
 
+import Control.Applicative ((<|>))
 import Control.Monad.Except (MonadError(..))
 import Control.Monad.Reader (MonadReader(..), ReaderT(..))
 import Control.Monad.IO.Class (MonadIO(..))
@@ -28,6 +30,7 @@ import Text.TEIWA.Source.Common (
   )
 import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (header, sentences)
 import qualified Text.TEIWA.Source.SSV as SSV (body, header)
+import qualified Text.TEIWA.Source.WebAnno as WebAnno (header, sentences)
 
 type Format = TEIWAParser Annotation
 
@@ -69,7 +72,18 @@ csv :: Format
 csv = ssv ','
 
+-- | Tab-separated input, tried as WebAnno TSV first and as plain TSV next.
 tsv :: Format
-tsv = ssv '\t'
+tsv = webAnno <|> ssv '\t'
+
+-- | WebAnno TSV input: its header, then sentence-level token annotations.
+webAnno :: Format
+webAnno = do
+  context <- annotationContext =<< WebAnno.header
+  SentenceLevel <$> (
+      WebAnno.sentences >>= mapM (
+          fmap SentenceAnnotation . mapM (annotateToken context)
+        )
+    )
 
 data Origin = File FilePath | Text Text
 
diff --git a/lib/Text/TEIWA/Source/WebAnno.hs b/lib/Text/TEIWA/Source/WebAnno.hs
new file mode 100644
index 0000000..a20a2a9
--- /dev/null
+++ b/lib/Text/TEIWA/Source/WebAnno.hs
@@ -0,0 +1,67 @@
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE OverloadedStrings #-}
+-- | Minimal reader for the WebAnno TSV format: the lines of the preamble are
+-- skipped without being interpreted and each sentence is read as a group of
+-- tab-separated rows.
+module Text.TEIWA.Source.WebAnno (
+    header
+  , sentences
+  ) where
+
+import Control.Applicative (many)
+import Data.Text.Lazy as Text (pack)
+import Text.Parsec (
+    ParsecT, Stream, char, digit, many1, noneOf, sepBy1, skipMany1, string, try
+  )
+import Text.TEIWA.Source.Common (
+    Field, Header, Row, TEIWAParser, eol, recordLine
+  )
+
+-- | Parse the preamble: the WebAnno format line (a fixed marker followed by a
+-- dotted version number), then any number of lines starting with a hash sign.
+-- Their content is ignored and the same four column names are always returned.
+header :: TEIWAParser Header
+header = do
+  -- 'try' stops a partial match of the marker from consuming input, so an
+  -- alternative tried after this parser still sees the whole document.
+  _ <- try (string "#FORMAT=WebAnno TSV ") *> version
+  _ <- many comment
+  pure [
+      "ID"
+    , "SPAN"
+    , "FORM"
+    , "LABEL"
+    ]
+  where
+    version = skipMany1 digit `sepBy1` char '.' *> eol
+
+-- | A line starting with a hash sign, consumed up to and including its end.
+comment :: Stream s m Char => ParsecT s u m String
+comment = char '#' *> many (noneOf "\r\n") <* eol
+
+-- | One cell: a non-empty run of characters other than tab and line breaks.
+-- A lone @_@ yields 'Nothing'; otherwise a backslash is dropped whenever a
+-- character follows it, and that character is kept as is.
+field :: Stream s m Char => ParsecT s u m Field
+field = build <$> many1 (noneOf "\t\n\r")
+  where
+    build "_" = Nothing
+    build s = Just . Text.pack $ unescape s
+    unescape "" = ""
+    unescape ('\\':c:s) = c:unescape s
+    unescape (c:s) = c:unescape s
+
+-- | One line of cells, each of them terminated by a tab, then the line end.
+row :: Stream s m Char => ParsecT s u m Row
+row = recordLine (many1 (field <* char '\t') <* eol)
+
+-- | The rows making up one sentence.
+type Sentence = [Row]
+
+-- | Any number of lines starting with a hash sign, then at least one row.
+sentence :: Stream s m Char => ParsecT s u m Sentence
+sentence = many comment *> many1 row
+
+-- | All the sentences, each one preceded by at least one line end.
+sentences :: Stream s m Char => ParsecT s u m [Sentence]
+sentences = many (many1 eol *> sentence)
diff --git a/teiwa.cabal b/teiwa.cabal
index 9760cff..bfd3856 100644
--- a/teiwa.cabal
+++ b/teiwa.cabal
@@ -31,6 +31,7 @@ library
                      , Text.TEIWA.Source.Common
                      , Text.TEIWA.Source.CoNLLX
                      , Text.TEIWA.Source.SSV
+                     , Text.TEIWA.Source.WebAnno
   build-depends:       base >=4.12 && <4.15
                      , bytestring
                      , mtl
-- 
GitLab