Skip to content
Snippets Groups Projects
Commit 619e9c17 authored by Alice Brenon's avatar Alice Brenon
Browse files

Support CoNLL-X's representation of contractions (labeled with a range instead...

Support CoNLL-X's representation of contractions (labeled with a range instead of a regular id and expanded on the following lines)
parent f22f1393
No related branches found
No related tags found
No related merge requests found
......@@ -19,40 +19,41 @@ import Data.Text.Lazy as Text (Text, toLower, unpack)
import Data.Text.Lazy.IO as Text (readFile)
import Text.Parsec (SourceName, runParserT)
import Text.TEIWA.Annotation (
Annotation(..), SentenceAnnotation(..), TokenAnnotation(..)
Annotation(..), Attributes, SentenceAnnotation(..), TokenAnnotation(..)
)
import Text.TEIWA.Config (Config(..))
import Text.TEIWA.Error (Error(..))
import Text.TEIWA.Source.Common (
AnnotationContext(..), Field, Row, TEIWAParser, annotationContext
)
import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (fields, sentences)
import qualified Text.TEIWA.Source.SSV as SSV (body, fields)
import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (header, sentences)
import qualified Text.TEIWA.Source.SSV as SSV (body, header)
type Format = TEIWAParser Annotation
isTEI :: Field -> Bool
isTEI :: Text -> Bool
isTEI = (`elem` ["pos", "lemma", "msd"])
filterAttributes :: MonadReader Config m => [(Field, Field)] -> m [(Field, Field)]
filterAttributes :: MonadReader Config m => [(Text, Field)] -> m Attributes
filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI
where
filterIf _ ("", _) = []
filterIf False (k, v) = [(k, v)]
filterIf True (k, v) =
filterIf _ (_, Nothing) = []
filterIf False (k, Just v) = [(k, v)]
filterIf True (k, Just v) =
let k' = Text.toLower k in if isTEI k' then [(k', v)] else []
annotateToken :: (MonadError Error m, MonadReader Config m) =>
AnnotationContext -> Row -> m TokenAnnotation
annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, record) =
case splitAt columnIndex record of
(_, []) -> throwError . MissingColumn atLine $ Text.unpack columnName
(before, form:after) ->
(before, (Just form):after) ->
TokenAnnotation form <$> filterAttributes (zip header $ before ++ after)
_ -> throwError . MissingColumn atLine $ Text.unpack columnName
coNLLX :: Format
coNLLX = do
context <- annotationContext CoNLLX.fields
context <- annotationContext CoNLLX.header
SentenceLevel <$> (
CoNLLX.sentences >>= mapM (
fmap SentenceAnnotation . mapM (annotateToken context)
......@@ -61,7 +62,7 @@ coNLLX = do
ssv :: Char -> Format
ssv separator = do
context <- annotationContext =<< SSV.fields separator
context <- annotationContext =<< SSV.header separator
TokenLevel <$> (SSV.body separator >>= mapM (annotateToken context))
csv :: Format
......
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Source.CoNLLX (
fields
header
, sentences
) where
import Control.Applicative (many)
import Data.Text.Lazy as Text (pack)
import Text.Parsec (ParsecT, Stream, char, many1, noneOf, sepBy1)
import Text.TEIWA.Source.Common (Field, Row, eol, recordLine)
import Control.Applicative ((<|>), many)
import Data.Text.Lazy as Text (concat, pack)
import Text.Parsec (ParsecT, Stream, char, digit, many1, noneOf, string, try)
import Text.TEIWA.Source.Common (Field, Header, Row, eol, recordLine)
fields :: [Field]
fields = [
header :: Header
header = [
"ID"
, "FORM"
, "LEMMA"
......@@ -25,10 +25,34 @@ fields = [
]
field :: Stream s m Char => ParsecT s u m Field
field = Text.pack <$> many1 (noneOf "\t\n\r")
field = build <$> many1 (noneOf "\t\n\r")
where
build "_" = Nothing
build s = Just $ Text.pack s
type Range = (Int, Int)
rowID :: Stream s m Char => ParsecT s u m (Either Range Int)
rowID = (Left <$> try ((,) <$> int <* char '-' <*> int)) <|> (Right <$> int)
where
int = read <$> many1 digit
row :: Stream s m Char => ParsecT s u m Row
row = recordLine (field `sepBy1` char '\t') <* eol
row = recordLine (rowID >>= either range singleLine)
where
toText = Text.pack . show
rest = many (char '\t' *> field) <* eol
singleLine n = (Just (toText n):) <$> rest
range (from, to) =
let rangeID = Text.concat [toText from, "-", toText to] in do
main <- rest
subs <- mapM (\k -> string (show k) *> rest) [from .. to]
return $ (Just rangeID):(zipWith (<|>) main $ foldl1 combineSubs subs)
combineSubs = zipWith combineFields
combineFields Nothing Nothing = Nothing
combineFields Nothing f@(Just _) = f
combineFields f@(Just _) Nothing = f
combineFields (Just a) (Just b) = Just $ Text.concat [a, "+", b]
type Sentence = [Row]
......
......@@ -4,6 +4,7 @@
module Text.TEIWA.Source.Common (
AnnotationContext(..)
, Field
, Header
, Row
, TEIWAParser
, annotationContext
......@@ -29,7 +30,7 @@ data AnnotationContext = AnnotationContext {
, header :: Header
}
type Field = Text
type Field = Maybe Text
type Header = [Text]
type Row = (Line, [Field])
......
{-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Source.SSV (
body
, fields
, header
) where
import Control.Applicative ((<|>), many)
import Control.Monad.Except (MonadError(..))
import Data.Maybe (catMaybes)
import Data.Text.Lazy as Text (pack)
import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try)
import Text.TEIWA.Source.Common (Field, Row, eol, recordLine)
import Text.TEIWA.Source.Common (Field, Header, Row, TEIWAParser, eol, recordLine)
import Text.TEIWA.Error (Error(..))
field :: Stream s m Char => Char -> ParsecT s u m Field
field separator = Text.pack <$> (regular <|> quoted)
field separator = notEmpty <$> (regular <|> quoted)
where
notEmpty "" = Nothing
notEmpty s = Just $ Text.pack s
regular = many (noneOf $ separator:"\n\r\"")
quoted = between quote quote $
many (noneOf "\"" <|> try (string "\"\"" *> pure '"'))
......@@ -20,5 +25,11 @@ field separator = Text.pack <$> (regular <|> quoted)
fields :: Stream s m Char => Char -> ParsecT s u m [Field]
fields separator = (field separator `sepBy` char separator) <* eol
header :: Char -> TEIWAParser Header
header separator = fields separator >>= ensureNotEmpty . catMaybes
where
ensureNotEmpty [] = throwError EmptyHeader
ensureNotEmpty l = return l
body :: Stream s m Char => Char -> ParsecT s u m [Row]
body = many . recordLine . fields
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment