Skip to content
Snippets Groups Projects
Commit ca22f6fe authored by Alice Brenon's avatar Alice Brenon
Browse files

Draft a very simple WebAnno reader ignoring the fields described in header and...

Draft a very simple WebAnno reader ignoring the fields described in header and taking a lot from TSV and CoNLL
parent 58a33992
No related branches found
No related tags found
No related merge requests found
...@@ -8,6 +8,7 @@ module Text.TEIWA ( ...@@ -8,6 +8,7 @@ module Text.TEIWA (
, annotateWith , annotateWith
, fromCSV , fromCSV
, fromCoNLLX , fromCoNLLX
, fromWebAnno
) where ) where
import Control.Monad.Except (MonadError(..)) import Control.Monad.Except (MonadError(..))
...@@ -34,3 +35,6 @@ fromCSV = annotate . Source csv ...@@ -34,3 +35,6 @@ fromCSV = annotate . Source csv
fromCoNLLX :: (MonadError Error m, MonadIO m) => Origin -> Filter m fromCoNLLX :: (MonadError Error m, MonadIO m) => Origin -> Filter m
fromCoNLLX = annotate . Source coNLLX fromCoNLLX = annotate . Source coNLLX
fromWebAnno :: (MonadError Error m, MonadIO m) => Origin -> Filter m
fromWebAnno = annotate . Source webAnno
...@@ -10,8 +10,10 @@ module Text.TEIWA.Source ( ...@@ -10,8 +10,10 @@ module Text.TEIWA.Source (
, runTEIWAParser , runTEIWAParser
, parse , parse
, tsv , tsv
, webAnno
) where ) where
import Control.Applicative ((<|>))
import Control.Monad.Except (MonadError(..)) import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (MonadReader(..), ReaderT(..)) import Control.Monad.Reader (MonadReader(..), ReaderT(..))
import Control.Monad.IO.Class (MonadIO(..)) import Control.Monad.IO.Class (MonadIO(..))
...@@ -28,6 +30,7 @@ import Text.TEIWA.Source.Common ( ...@@ -28,6 +30,7 @@ import Text.TEIWA.Source.Common (
) )
import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (header, sentences) import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (header, sentences)
import qualified Text.TEIWA.Source.SSV as SSV (body, header) import qualified Text.TEIWA.Source.SSV as SSV (body, header)
import qualified Text.TEIWA.Source.WebAnno as WebAnno (header, sentences)
type Format = TEIWAParser Annotation type Format = TEIWAParser Annotation
...@@ -69,7 +72,16 @@ csv :: Format ...@@ -69,7 +72,16 @@ csv :: Format
csv = ssv ',' csv = ssv ','
tsv :: Format tsv :: Format
tsv = ssv '\t' tsv = webAnno <|> ssv '\t'
webAnno :: Format
webAnno = do
context <- annotationContext =<< WebAnno.header
SentenceLevel <$> (
WebAnno.sentences >>= mapM (
fmap SentenceAnnotation . mapM (annotateToken context)
)
)
data Origin = File FilePath | Text Text data Origin = File FilePath | Text Text
......
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
module Text.TEIWA.Source.WebAnno (
header
, sentences
) where
import Control.Applicative (many)
import Data.Text.Lazy as Text (pack)
import Text.Parsec (
ParsecT, Stream, char, digit, many1, noneOf, sepBy1, skipMany1, string
)
import Text.TEIWA.Source.Common (
Field, Header, Row, TEIWAParser, eol, recordLine
)
header :: TEIWAParser Header
header = do
string "#FORMAT=WebAnno TSV " *> version
many comment *> pure ()
pure [
"ID"
, "SPAN"
, "FORM"
, "LABEL"
]
where
version = skipMany1 digit `sepBy1` char '.' *> eol
comment = char '#' *> many (noneOf "\r\n") <* eol
field :: Stream s m Char => ParsecT s u m Field
field = build <$> many1 (noneOf "\t\n\r")
where
build "_" = Nothing
build s = Just . Text.pack $ unescape s
unescape "" = ""
unescape ('\\':c:s) = c:unescape s
unescape (c:s) = c:unescape s
row :: Stream s m Char => ParsecT s u m Row
row = recordLine (many1 (field <* char '\t') <* eol)
type Sentence = [Row]
sentence :: Stream s m Char => ParsecT s u m Sentence
sentence = many comment *> many1 row
where
comment = char '#' *> many (noneOf "\r\n") <* eol
sentences :: Stream s m Char => ParsecT s u m [Sentence]
sentences = many (many1 eol *> sentence)
...@@ -31,6 +31,7 @@ library ...@@ -31,6 +31,7 @@ library
, Text.TEIWA.Source.Common , Text.TEIWA.Source.Common
, Text.TEIWA.Source.CoNLLX , Text.TEIWA.Source.CoNLLX
, Text.TEIWA.Source.SSV , Text.TEIWA.Source.SSV
, Text.TEIWA.Source.WebAnno
build-depends: base >=4.12 && <4.15 build-depends: base >=4.12 && <4.15
, bytestring , bytestring
, mtl , mtl
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment