From 7f8047ee3d278248928dc2ef7fa2e7275b320803 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Tue, 2 Nov 2021 16:38:43 +0100
Subject: [PATCH] Fix CoNLL and WebAnno formats to pass the regression tests

---
 lib/Text/TEIWA/Source/Common.hs  | 4 ++--
 lib/Text/TEIWA/Source/WebAnno.hs | 7 ++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/lib/Text/TEIWA/Source/Common.hs b/lib/Text/TEIWA/Source/Common.hs
index 787ac02..ba31db2 100644
--- a/lib/Text/TEIWA/Source/Common.hs
+++ b/lib/Text/TEIWA/Source/Common.hs
@@ -18,7 +18,7 @@ import Data.Char (isPunctuation)
 import Data.Text.Lazy as Text (Text, concat, head, length)
 import Text.Parsec (
     Line, ParsecT, Stream, char, endOfLine, getParserState, many1, noneOf
-  , sourceLine, statePos, try
+  , sepEndBy, sourceLine, statePos, try
   )
 import Text.TEIWA.Config (Config(..))
 import Text.TEIWA.Error (Error(..))
@@ -48,7 +48,7 @@ sentence row = many comment *> many1 row
     comment = char '#' *> many (noneOf "\r\n") <* eol
 
 sentences :: Stream s m Char => ParsecT s u m Row -> ParsecT s u m [Sentence]
-sentences row = many (many1 eol *> sentence row)
+sentences row = sentence row `sepEndBy` many1 eol
 
 teiTagger :: Text -> Attributes -> Text
 teiTagger t _
diff --git a/lib/Text/TEIWA/Source/WebAnno.hs b/lib/Text/TEIWA/Source/WebAnno.hs
index ed75a3e..f2f905b 100644
--- a/lib/Text/TEIWA/Source/WebAnno.hs
+++ b/lib/Text/TEIWA/Source/WebAnno.hs
@@ -23,10 +23,11 @@ header = do
       "ID"
     , "SPAN"
     , "FORM"
-    , "LABEL"
+    , "LEMMA"
+    , "POS"
     ]
   where
-    version = skipMany1 digit `sepBy1` char '.' *> eol
+    version = skipMany1 digit `sepBy1` char '.' *> eol
     comment = char '#' *> many (noneOf "\r\n") <* eol
 
 field :: Stream s m Char => ParsecT s u m Field
@@ -45,7 +46,7 @@ webAnno :: Format
 webAnno = do
   context <- Context.ofHeader =<< header
   Annotations <$> (
-      sentences row >>= mapM (
+      (many eol *> sentences row) >>= mapM (
           fmap (Node s_ . Annotations) . mapM (tagToken context)
         )
     )
-- 
GitLab