Skip to content
Snippets Groups Projects
Commit ca22f6fe authored by Alice Brenon's avatar Alice Brenon
Browse files

Draft a very simple WebAnno reader ignoring the fields described in header and...

Draft a very simple WebAnno reader ignoring the fields described in header and taking a lot from TSV and CoNLL
parent 58a33992
No related branches found
No related tags found
No related merge requests found
......@@ -8,6 +8,7 @@ module Text.TEIWA (
, annotateWith
, fromCSV
, fromCoNLLX
, fromWebAnno
) where
import Control.Monad.Except (MonadError(..))
......@@ -34,3 +35,6 @@ fromCSV = annotate . Source csv
fromCoNLLX :: (MonadError Error m, MonadIO m) => Origin -> Filter m
fromCoNLLX = annotate . Source coNLLX
fromWebAnno :: (MonadError Error m, MonadIO m) => Origin -> Filter m
fromWebAnno = annotate . Source webAnno
......@@ -10,8 +10,10 @@ module Text.TEIWA.Source (
, runTEIWAParser
, parse
, tsv
, webAnno
) where
import Control.Applicative ((<|>))
import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (MonadReader(..), ReaderT(..))
import Control.Monad.IO.Class (MonadIO(..))
......@@ -28,6 +30,7 @@ import Text.TEIWA.Source.Common (
)
import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (header, sentences)
import qualified Text.TEIWA.Source.SSV as SSV (body, header)
import qualified Text.TEIWA.Source.WebAnno as WebAnno (header, sentences)
type Format = TEIWAParser Annotation
......@@ -69,7 +72,16 @@ csv :: Format
csv = ssv ','
tsv :: Format
tsv = ssv '\t'
tsv = webAnno <|> ssv '\t'
webAnno :: Format
webAnno = do
context <- annotationContext =<< WebAnno.header
SentenceLevel <$> (
WebAnno.sentences >>= mapM (
fmap SentenceAnnotation . mapM (annotateToken context)
)
)
data Origin = File FilePath | Text Text
......
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
module Text.TEIWA.Source.WebAnno (
header
, sentences
) where
import Control.Applicative (many)
import Data.Text.Lazy as Text (pack)
import Text.Parsec (
ParsecT, Stream, char, digit, many1, noneOf, sepBy1, skipMany1, string
)
import Text.TEIWA.Source.Common (
Field, Header, Row, TEIWAParser, eol, recordLine
)
header :: TEIWAParser Header
header = do
string "#FORMAT=WebAnno TSV " *> version
many comment *> pure ()
pure [
"ID"
, "SPAN"
, "FORM"
, "LABEL"
]
where
version = skipMany1 digit `sepBy1` char '.' *> eol
comment = char '#' *> many (noneOf "\r\n") <* eol
field :: Stream s m Char => ParsecT s u m Field
field = build <$> many1 (noneOf "\t\n\r")
where
build "_" = Nothing
build s = Just . Text.pack $ unescape s
unescape "" = ""
unescape ('\\':c:s) = c:unescape s
unescape (c:s) = c:unescape s
row :: Stream s m Char => ParsecT s u m Row
row = recordLine (many1 (field <* char '\t') <* eol)
type Sentence = [Row]
sentence :: Stream s m Char => ParsecT s u m Sentence
sentence = many comment *> many1 row
where
comment = char '#' *> many (noneOf "\r\n") <* eol
sentences :: Stream s m Char => ParsecT s u m [Sentence]
sentences = many (many1 eol *> sentence)
......@@ -31,6 +31,7 @@ library
, Text.TEIWA.Source.Common
, Text.TEIWA.Source.CoNLLX
, Text.TEIWA.Source.SSV
, Text.TEIWA.Source.WebAnno
build-depends: base >=4.12 && <4.15
, bytestring
, mtl
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment