diff --git a/lib/Text/TEIWA.hs b/lib/Text/TEIWA.hs
index 3024941cb45344e5aeb419b543f9584ea5248702..d4ad823e5cdc672ec20a80ef841238c0c58e5411 100644
--- a/lib/Text/TEIWA.hs
+++ b/lib/Text/TEIWA.hs
@@ -8,6 +8,7 @@ module Text.TEIWA (
   , annotateWith
   , fromCSV
   , fromCoNLLX
+  , fromWebAnno
   ) where
 
 import Control.Monad.Except (MonadError(..))
@@ -34,3 +35,7 @@ fromCSV = annotate . Source csv
 
 fromCoNLLX :: (MonadError Error m, MonadIO m) => Origin -> Filter m
 fromCoNLLX = annotate . Source coNLLX
+
+-- | Build a 'Filter' that annotates from an 'Origin' read as WebAnno TSV.
+fromWebAnno :: (MonadError Error m, MonadIO m) => Origin -> Filter m
+fromWebAnno = annotate . Source webAnno
diff --git a/lib/Text/TEIWA/Source.hs b/lib/Text/TEIWA/Source.hs
index 0858fcb770e1883b9b6215172ff5b099a601fad0..d6b9f66abbd52ea06780d36982b9cebf99b45e2c 100644
--- a/lib/Text/TEIWA/Source.hs
+++ b/lib/Text/TEIWA/Source.hs
@@ -10,8 +10,10 @@ module Text.TEIWA.Source (
   , runTEIWAParser
   , parse
   , tsv
+  , webAnno
   ) where
 
+import Control.Applicative ((<|>))
 import Control.Monad.Except (MonadError(..))
 import Control.Monad.Reader (MonadReader(..), ReaderT(..))
 import Control.Monad.IO.Class (MonadIO(..))
@@ -28,6 +30,7 @@ import Text.TEIWA.Source.Common (
   )
 import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (header, sentences)
 import qualified Text.TEIWA.Source.SSV as SSV (body, header)
+import qualified Text.TEIWA.Source.WebAnno as WebAnno (header, sentences)
 
 type Format = TEIWAParser Annotation
 
@@ -69,7 +72,22 @@ csv :: Format
 csv = ssv ','
 
 tsv :: Format
-tsv = ssv '\t'
+-- 'webAnno' is tried first: its header parser backtracks when the WebAnno
+-- magic prefix is absent, so any other tab-separated input is handed to the
+-- generic separated-values parser from its very first character.
+tsv = webAnno <|> ssv '\t'
+
+-- | Parse WebAnno TSV input into a sentence-level annotation: the
+-- annotation context is derived from the parsed header, then every row of
+-- every parsed sentence is annotated within that context.
+webAnno :: Format
+webAnno = do
+  context <- annotationContext =<< WebAnno.header
+  SentenceLevel <$> (
+      WebAnno.sentences >>= mapM (
+          fmap SentenceAnnotation . mapM (annotateToken context)
+        )
+    )
 
 data Origin = File FilePath | Text Text
 
diff --git a/lib/Text/TEIWA/Source/WebAnno.hs b/lib/Text/TEIWA/Source/WebAnno.hs
new file mode 100644
index 0000000000000000000000000000000000000000..a20a2a91ba4d6f727d8eca6eaa78cb3ddb3b96bc
--- /dev/null
+++ b/lib/Text/TEIWA/Source/WebAnno.hs
@@ -0,0 +1,65 @@
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE OverloadedStrings #-}
+-- | Parsers for the WebAnno TSV format.
+module Text.TEIWA.Source.WebAnno (
+    header
+  , sentences
+  ) where
+
+import Control.Applicative (many)
+import Data.Text.Lazy as Text (pack)
+import Text.Parsec (
+    ParsecT, Stream, char, digit, many1, noneOf, sepBy1, skipMany, skipMany1
+  , string, try
+  )
+import Text.TEIWA.Source.Common (
+    Field, Header, Row, TEIWAParser, eol, recordLine
+  )
+
+-- | A line starting with a hash sign, consumed through its end of line;
+-- yields the text that follows the hash sign.
+comment :: Stream s m Char => ParsecT s u m String
+comment = char '#' *> many (noneOf "\r\n") <* eol
+
+-- | Match the @FORMAT=WebAnno TSV@ magic line with its dotted version
+-- number, skip the comment lines that directly follow it, and return a fixed
+-- list of four column names (nothing in the input is read to build it).
+header :: TEIWAParser Header
+header = do
+  -- 'string' fails after consuming input on a partial match; 'try' rewinds
+  -- so that an alternative parser still sees the input from its start.
+  _ <- try (string "#FORMAT=WebAnno TSV ") *> version
+  skipMany comment
+  pure [
+      "ID"
+    , "SPAN"
+    , "FORM"
+    , "LABEL"
+    ]
+  where
+    version = skipMany1 digit `sepBy1` char '.' *> eol
+
+-- | A non-empty cell: a lone @_@ stands for "no value"; otherwise each
+-- backslash is dropped and the character after it is kept verbatim.
+field :: Stream s m Char => ParsecT s u m Field
+field = build <$> many1 (noneOf "\t\n\r")
+  where
+    build "_" = Nothing
+    build s = Just . Text.pack $ unescape s
+    unescape "" = ""
+    unescape ('\\':c:s) = c:unescape s
+    unescape (c:s) = c:unescape s
+
+-- | One record line: one or more cells, each followed by a tab, then 'eol'.
+row :: Stream s m Char => ParsecT s u m Row
+row = recordLine (many1 (field <* char '\t') <* eol)
+
+type Sentence = [Row]
+
+-- | Any number of comment lines followed by at least one 'row'.
+sentence :: Stream s m Char => ParsecT s u m Sentence
+sentence = many comment *> many1 row
+
+-- | Zero or more sentences, each introduced by one or more 'eol's.
+sentences :: Stream s m Char => ParsecT s u m [Sentence]
+sentences = many (many1 eol *> sentence)
diff --git a/teiwa.cabal b/teiwa.cabal
index 9760cffc4661004d147905521390dd49fb42a3ec..bfd3856cd6f55aadd3d36ef0012ec2d674fdbe59 100644
--- a/teiwa.cabal
+++ b/teiwa.cabal
@@ -31,6 +31,7 @@ library
                      , Text.TEIWA.Source.Common
                      , Text.TEIWA.Source.CoNLLX
                      , Text.TEIWA.Source.SSV
+                     , Text.TEIWA.Source.WebAnno
   build-depends:       base >=4.12 && <4.15
                      , bytestring
                      , mtl