-
Alice Brenon authored
Finish implementing the annotation-building process for WebAnno + add a lot of unit and regression tests
f8a0f4b1
CoNLLX.hs 1.69 KiB
{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-}
module Text.TEIWA.Source.CoNLLX (
coNLLX
) where
import Control.Applicative ((<|>), many)
import Data.Text.Lazy as Text (Text, concat)
import Text.Parsec (ParsecT, Stream, char, string)
import Text.TEIWA.Annotation (tagToken, teiSentence)
import Text.TEIWA.Annotation.Data (Annotation(..), Format)
import Text.TEIWA.Annotation.Context as Context (ofHeader)
import Text.TEIWA.Source.Common (
Field, Header, Range(..), avoid, eol, int, range, sentences, toField
)
header :: Header
header = [
"ID"
, "FORM"
, "LEMMA"
, "CPOSTAG"
, "POSTAG"
, "FEATS"
, "HEAD"
, "DEPREL"
, "PHEAD"
, "PDEPREL"
]
fields :: Stream s m Char => ParsecT s u m [Field]
fields = many (char '\t' *> (build <$> avoid "\t\n\r")) <* eol
where
build "_" = Nothing
build other = Just other
combineSubs :: [Maybe Text] -> [Maybe Text] -> [Maybe Text]
combineSubs = zipWith combineFields
where
combineFields Nothing Nothing = Nothing
combineFields Nothing f@(Just _) = f
combineFields f@(Just _) Nothing = f
combineFields (Just a) (Just b) = Just $ Text.concat [a, "+", b]
row :: Stream s m Char => ParsecT s u m [Field]
row = (range >>= rangeLines) <|> (int >>= singleLine)
where
singleLine n = (toField n:) <$> fields
rangeLines r@(Range {from, to}) = do
main <- fields
subs <- mapM (\k -> string (show k) *> fields) [from .. to]
pure $ (toField r):(zipWith (<|>) main $ foldl1 combineSubs subs)
coNLLX :: Format
coNLLX = do
context <- Context.ofHeader header
Annotations <$> (
sentences row >>= mapM (
fmap teiSentence . mapM (tagToken context)
)
)