{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-} module Text.TEIWA.Source.CoNLLX ( coNLLX ) where import Control.Applicative ((<|>), many) import Data.Text.Lazy as Text (Text, concat) import Text.Parsec (ParsecT, Stream, char, string) import Text.TEIWA.Annotation (tagToken, teiSentence) import Text.TEIWA.Annotation.Data (Annotation(..), Format) import Text.TEIWA.Annotation.Context as Context (ofHeader) import Text.TEIWA.Source.Common ( Field, Header, Range(..), avoid, eol, int, range, sentences, toField ) header :: Header header = [ "ID" , "FORM" , "LEMMA" , "CPOSTAG" , "POSTAG" , "FEATS" , "HEAD" , "DEPREL" , "PHEAD" , "PDEPREL" ] fields :: Stream s m Char => ParsecT s u m [Field] fields = many (char '\t' *> (build <$> avoid "\t\n\r")) <* eol where build "_" = Nothing build other = Just other combineSubs :: [Maybe Text] -> [Maybe Text] -> [Maybe Text] combineSubs = zipWith combineFields where combineFields Nothing Nothing = Nothing combineFields Nothing f@(Just _) = f combineFields f@(Just _) Nothing = f combineFields (Just a) (Just b) = Just $ Text.concat [a, "+", b] row :: Stream s m Char => ParsecT s u m [Field] row = (range >>= rangeLines) <|> (int >>= singleLine) where singleLine n = (toField n:) <$> fields rangeLines r@(Range {from, to}) = do main <- fields subs <- mapM (\k -> string (show k) *> fields) [from .. to] pure $ (toField r):(zipWith (<|>) main $ foldl1 combineSubs subs) coNLLX :: Format coNLLX = do context <- Context.ofHeader header Annotations <$> ( sentences row >>= mapM ( fmap teiSentence . mapM (tagToken context) ) )