Skip to content
Snippets Groups Projects
CoNLLX.hs 1.69 KiB
{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-}
module Text.TEIWA.Source.CoNLLX (
    coNLLX
  ) where

import Control.Applicative ((<|>), many)
import Data.Text.Lazy as Text (Text, concat)
import Text.Parsec (ParsecT, Stream, char, string)
import Text.TEIWA.Annotation (tagToken, teiSentence)
import Text.TEIWA.Annotation.Data (Annotation(..), Format)
import Text.TEIWA.Annotation.Context as Context (ofHeader)
import Text.TEIWA.Source.Common (
    Field, Header, Range(..), avoid, eol, int, range, sentences, toField
  )

header :: Header
header = [
      "ID"
    , "FORM"
    , "LEMMA"
    , "CPOSTAG"
    , "POSTAG"
    , "FEATS"
    , "HEAD"
    , "DEPREL"
    , "PHEAD"
    , "PDEPREL"
  ]

fields :: Stream s m Char => ParsecT s u m [Field]
fields = many (char '\t' *> (build <$> avoid "\t\n\r")) <* eol
  where
    build "_" = Nothing
    build other = Just other

combineSubs :: [Maybe Text] -> [Maybe Text] -> [Maybe Text]
combineSubs = zipWith combineFields
  where
    combineFields Nothing Nothing = Nothing
    combineFields Nothing f@(Just _) = f
    combineFields f@(Just _) Nothing = f
    combineFields (Just a) (Just b) = Just $ Text.concat [a, "+", b]

row :: Stream s m Char => ParsecT s u m [Field]
row = (range >>= rangeLines) <|> (int >>= singleLine)
  where
    singleLine n = (toField n:) <$> fields
    rangeLines r@(Range {from, to}) = do
      main <- fields
      subs <- mapM (\k -> string (show k) *> fields) [from .. to]
      pure $ (toField r):(zipWith (<|>) main $ foldl1 combineSubs subs)

coNLLX :: Format
coNLLX = do
  context <- Context.ofHeader header
  Annotations <$> (
      sentences row >>= mapM (
          fmap teiSentence . mapM (tagToken context)
        )
    )