Skip to content
Snippets Groups Projects
Commit 619e9c17 authored by Alice Brenon's avatar Alice Brenon
Browse files

Support CoNLL-X's representation of contractions (labeled with a range instead...

Support CoNLL-X's representation of contractions (labeled with a range instead of a regular id and expanded on the following lines)
parent f22f1393
No related branches found
No related tags found
No related merge requests found
...@@ -19,40 +19,41 @@ import Data.Text.Lazy as Text (Text, toLower, unpack) ...@@ -19,40 +19,41 @@ import Data.Text.Lazy as Text (Text, toLower, unpack)
import Data.Text.Lazy.IO as Text (readFile) import Data.Text.Lazy.IO as Text (readFile)
import Text.Parsec (SourceName, runParserT) import Text.Parsec (SourceName, runParserT)
import Text.TEIWA.Annotation ( import Text.TEIWA.Annotation (
Annotation(..), SentenceAnnotation(..), TokenAnnotation(..) Annotation(..), Attributes, SentenceAnnotation(..), TokenAnnotation(..)
) )
import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Config (Config(..))
import Text.TEIWA.Error (Error(..)) import Text.TEIWA.Error (Error(..))
import Text.TEIWA.Source.Common ( import Text.TEIWA.Source.Common (
AnnotationContext(..), Field, Row, TEIWAParser, annotationContext AnnotationContext(..), Field, Row, TEIWAParser, annotationContext
) )
import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (fields, sentences) import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (header, sentences)
import qualified Text.TEIWA.Source.SSV as SSV (body, fields) import qualified Text.TEIWA.Source.SSV as SSV (body, header)
type Format = TEIWAParser Annotation type Format = TEIWAParser Annotation
isTEI :: Field -> Bool isTEI :: Text -> Bool
isTEI = (`elem` ["pos", "lemma", "msd"]) isTEI = (`elem` ["pos", "lemma", "msd"])
filterAttributes :: MonadReader Config m => [(Field, Field)] -> m [(Field, Field)] filterAttributes :: MonadReader Config m => [(Text, Field)] -> m Attributes
filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI
where where
filterIf _ ("", _) = [] filterIf _ ("", _) = []
filterIf False (k, v) = [(k, v)] filterIf _ (_, Nothing) = []
filterIf True (k, v) = filterIf False (k, Just v) = [(k, v)]
filterIf True (k, Just v) =
let k' = Text.toLower k in if isTEI k' then [(k', v)] else [] let k' = Text.toLower k in if isTEI k' then [(k', v)] else []
annotateToken :: (MonadError Error m, MonadReader Config m) => annotateToken :: (MonadError Error m, MonadReader Config m) =>
AnnotationContext -> Row -> m TokenAnnotation AnnotationContext -> Row -> m TokenAnnotation
annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, record) = annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, record) =
case splitAt columnIndex record of case splitAt columnIndex record of
(_, []) -> throwError . MissingColumn atLine $ Text.unpack columnName (before, (Just form):after) ->
(before, form:after) ->
TokenAnnotation form <$> filterAttributes (zip header $ before ++ after) TokenAnnotation form <$> filterAttributes (zip header $ before ++ after)
_ -> throwError . MissingColumn atLine $ Text.unpack columnName
coNLLX :: Format coNLLX :: Format
coNLLX = do coNLLX = do
context <- annotationContext CoNLLX.fields context <- annotationContext CoNLLX.header
SentenceLevel <$> ( SentenceLevel <$> (
CoNLLX.sentences >>= mapM ( CoNLLX.sentences >>= mapM (
fmap SentenceAnnotation . mapM (annotateToken context) fmap SentenceAnnotation . mapM (annotateToken context)
...@@ -61,7 +62,7 @@ coNLLX = do ...@@ -61,7 +62,7 @@ coNLLX = do
ssv :: Char -> Format ssv :: Char -> Format
ssv separator = do ssv separator = do
context <- annotationContext =<< SSV.fields separator context <- annotationContext =<< SSV.header separator
TokenLevel <$> (SSV.body separator >>= mapM (annotateToken context)) TokenLevel <$> (SSV.body separator >>= mapM (annotateToken context))
csv :: Format csv :: Format
......
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Source.CoNLLX ( module Text.TEIWA.Source.CoNLLX (
fields header
, sentences , sentences
) where ) where
import Control.Applicative (many) import Control.Applicative ((<|>), many)
import Data.Text.Lazy as Text (pack) import Data.Text.Lazy as Text (concat, pack)
import Text.Parsec (ParsecT, Stream, char, many1, noneOf, sepBy1) import Text.Parsec (ParsecT, Stream, char, digit, many1, noneOf, string, try)
import Text.TEIWA.Source.Common (Field, Row, eol, recordLine) import Text.TEIWA.Source.Common (Field, Header, Row, eol, recordLine)
fields :: [Field] header :: Header
fields = [ header = [
"ID" "ID"
, "FORM" , "FORM"
, "LEMMA" , "LEMMA"
...@@ -25,10 +25,34 @@ fields = [ ...@@ -25,10 +25,34 @@ fields = [
] ]
field :: Stream s m Char => ParsecT s u m Field field :: Stream s m Char => ParsecT s u m Field
field = Text.pack <$> many1 (noneOf "\t\n\r") field = build <$> many1 (noneOf "\t\n\r")
where
build "_" = Nothing
build s = Just $ Text.pack s
type Range = (Int, Int)
rowID :: Stream s m Char => ParsecT s u m (Either Range Int)
rowID = (Left <$> try ((,) <$> int <* char '-' <*> int)) <|> (Right <$> int)
where
int = read <$> many1 digit
row :: Stream s m Char => ParsecT s u m Row row :: Stream s m Char => ParsecT s u m Row
row = recordLine (field `sepBy1` char '\t') <* eol row = recordLine (rowID >>= either range singleLine)
where
toText = Text.pack . show
rest = many (char '\t' *> field) <* eol
singleLine n = (Just (toText n):) <$> rest
range (from, to) =
let rangeID = Text.concat [toText from, "-", toText to] in do
main <- rest
subs <- mapM (\k -> string (show k) *> rest) [from .. to]
return $ (Just rangeID):(zipWith (<|>) main $ foldl1 combineSubs subs)
combineSubs = zipWith combineFields
combineFields Nothing Nothing = Nothing
combineFields Nothing f@(Just _) = f
combineFields f@(Just _) Nothing = f
combineFields (Just a) (Just b) = Just $ Text.concat [a, "+", b]
type Sentence = [Row] type Sentence = [Row]
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
module Text.TEIWA.Source.Common ( module Text.TEIWA.Source.Common (
AnnotationContext(..) AnnotationContext(..)
, Field , Field
, Header
, Row , Row
, TEIWAParser , TEIWAParser
, annotationContext , annotationContext
...@@ -29,7 +30,7 @@ data AnnotationContext = AnnotationContext { ...@@ -29,7 +30,7 @@ data AnnotationContext = AnnotationContext {
, header :: Header , header :: Header
} }
type Field = Text type Field = Maybe Text
type Header = [Text] type Header = [Text]
type Row = (Line, [Field]) type Row = (Line, [Field])
......
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Source.SSV ( module Text.TEIWA.Source.SSV (
body body
, fields , header
) where ) where
import Control.Applicative ((<|>), many) import Control.Applicative ((<|>), many)
import Control.Monad.Except (MonadError(..))
import Data.Maybe (catMaybes)
import Data.Text.Lazy as Text (pack) import Data.Text.Lazy as Text (pack)
import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try) import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try)
import Text.TEIWA.Source.Common (Field, Row, eol, recordLine) import Text.TEIWA.Source.Common (Field, Header, Row, TEIWAParser, eol, recordLine)
import Text.TEIWA.Error (Error(..))
field :: Stream s m Char => Char -> ParsecT s u m Field field :: Stream s m Char => Char -> ParsecT s u m Field
field separator = Text.pack <$> (regular <|> quoted) field separator = notEmpty <$> (regular <|> quoted)
where where
notEmpty "" = Nothing
notEmpty s = Just $ Text.pack s
regular = many (noneOf $ separator:"\n\r\"") regular = many (noneOf $ separator:"\n\r\"")
quoted = between quote quote $ quoted = between quote quote $
many (noneOf "\"" <|> try (string "\"\"" *> pure '"')) many (noneOf "\"" <|> try (string "\"\"" *> pure '"'))
...@@ -20,5 +25,11 @@ field separator = Text.pack <$> (regular <|> quoted) ...@@ -20,5 +25,11 @@ field separator = Text.pack <$> (regular <|> quoted)
fields :: Stream s m Char => Char -> ParsecT s u m [Field] fields :: Stream s m Char => Char -> ParsecT s u m [Field]
fields separator = (field separator `sepBy` char separator) <* eol fields separator = (field separator `sepBy` char separator) <* eol
header :: Char -> TEIWAParser Header
header separator = fields separator >>= ensureNotEmpty . catMaybes
where
ensureNotEmpty [] = throwError EmptyHeader
ensureNotEmpty l = return l
body :: Stream s m Char => Char -> ParsecT s u m [Row] body :: Stream s m Char => Char -> ParsecT s u m [Row]
body = many . recordLine . fields body = many . recordLine . fields
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment