diff --git a/lib/Text/TEIWA/Source.hs b/lib/Text/TEIWA/Source.hs index 6c906f780dc252f755627b9c1bf435176fb33343..0858fcb770e1883b9b6215172ff5b099a601fad0 100644 --- a/lib/Text/TEIWA/Source.hs +++ b/lib/Text/TEIWA/Source.hs @@ -19,40 +19,41 @@ import Data.Text.Lazy as Text (Text, toLower, unpack) import Data.Text.Lazy.IO as Text (readFile) import Text.Parsec (SourceName, runParserT) import Text.TEIWA.Annotation ( - Annotation(..), SentenceAnnotation(..), TokenAnnotation(..) + Annotation(..), Attributes, SentenceAnnotation(..), TokenAnnotation(..) ) import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Error (Error(..)) import Text.TEIWA.Source.Common ( AnnotationContext(..), Field, Row, TEIWAParser, annotationContext ) -import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (fields, sentences) -import qualified Text.TEIWA.Source.SSV as SSV (body, fields) +import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (header, sentences) +import qualified Text.TEIWA.Source.SSV as SSV (body, header) type Format = TEIWAParser Annotation -isTEI :: Field -> Bool +isTEI :: Text -> Bool isTEI = (`elem` ["pos", "lemma", "msd"]) -filterAttributes :: MonadReader Config m => [(Field, Field)] -> m [(Field, Field)] +filterAttributes :: MonadReader Config m => [(Text, Field)] -> m Attributes filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI where filterIf _ ("", _) = [] - filterIf False (k, v) = [(k, v)] - filterIf True (k, v) = + filterIf _ (_, Nothing) = [] + filterIf False (k, Just v) = [(k, v)] + filterIf True (k, Just v) = let k' = Text.toLower k in if isTEI k' then [(k', v)] else [] annotateToken :: (MonadError Error m, MonadReader Config m) => AnnotationContext -> Row -> m TokenAnnotation annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, record) = case splitAt columnIndex record of - (_, []) -> throwError . MissingColumn atLine $ Text.unpack columnName - (before, form:after) -> + (before, (Just form):after) -> TokenAnnotation form <$> filterAttributes (zip header $ before ++ after) + _ -> throwError . MissingColumn atLine $ Text.unpack columnName coNLLX :: Format coNLLX = do - context <- annotationContext CoNLLX.fields + context <- annotationContext CoNLLX.header SentenceLevel <$> ( CoNLLX.sentences >>= mapM ( fmap SentenceAnnotation . mapM (annotateToken context) @@ -61,7 +62,7 @@ coNLLX = do ssv :: Char -> Format ssv separator = do - context <- annotationContext =<< SSV.fields separator + context <- annotationContext =<< SSV.header separator TokenLevel <$> (SSV.body separator >>= mapM (annotateToken context)) csv :: Format diff --git a/lib/Text/TEIWA/Source/CoNLLX.hs b/lib/Text/TEIWA/Source/CoNLLX.hs index 5f2d517929f01df8433ca9c60922648c6990b57f..0a7f95733ee7f13ae971648467732475b310db85 100644 --- a/lib/Text/TEIWA/Source/CoNLLX.hs +++ b/lib/Text/TEIWA/Source/CoNLLX.hs @@ -1,17 +1,17 @@ {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE FlexibleContexts #-} module Text.TEIWA.Source.CoNLLX ( - fields + header , sentences ) where -import Control.Applicative (many) -import Data.Text.Lazy as Text (pack) -import Text.Parsec (ParsecT, Stream, char, many1, noneOf, sepBy1) -import Text.TEIWA.Source.Common (Field, Row, eol, recordLine) +import Control.Applicative ((<|>), many) +import Data.Text.Lazy as Text (concat, pack) +import Text.Parsec (ParsecT, Stream, char, digit, many1, noneOf, string, try) +import Text.TEIWA.Source.Common (Field, Header, Row, eol, recordLine) -fields :: [Field] -fields = [ +header :: Header +header = [ "ID" , "FORM" , "LEMMA" @@ -25,10 +25,34 @@ fields = [ ] field :: Stream s m Char => ParsecT s u m Field -field = Text.pack <$> many1 (noneOf "\t\n\r") +field = build <$> many1 (noneOf "\t\n\r") + where + build "_" = Nothing + build s = Just $ Text.pack s + +type Range = (Int, Int) + +rowID :: Stream s m Char => ParsecT s u m (Either Range Int) +rowID = (Left <$> try ((,) <$> int <* char '-' <*> int)) <|> (Right <$> int) + where + int = read <$> many1 digit row :: Stream s m Char => ParsecT s u m Row -row = recordLine (field `sepBy1` char '\t') <* eol +row = recordLine (rowID >>= either range singleLine) + where + toText = Text.pack . show + rest = many (char '\t' *> field) <* eol + singleLine n = (Just (toText n):) <$> rest + range (from, to) = + let rangeID = Text.concat [toText from, "-", toText to] in do + main <- rest + subs <- mapM (\k -> string (show k) *> rest) [from .. to] + return $ (Just rangeID):(zipWith (<|>) main $ foldl1 combineSubs subs) + combineSubs = zipWith combineFields + combineFields Nothing Nothing = Nothing + combineFields Nothing f@(Just _) = f + combineFields f@(Just _) Nothing = f + combineFields (Just a) (Just b) = Just $ Text.concat [a, "+", b] type Sentence = [Row] diff --git a/lib/Text/TEIWA/Source/Common.hs b/lib/Text/TEIWA/Source/Common.hs index 83839e9a5463ef6fc5372ca1e64221655c196605..ea0a253deec29af9622fabe844353c6a7e325154 100644 --- a/lib/Text/TEIWA/Source/Common.hs +++ b/lib/Text/TEIWA/Source/Common.hs @@ -4,6 +4,7 @@ module Text.TEIWA.Source.Common ( AnnotationContext(..) , Field + , Header , Row , TEIWAParser , annotationContext @@ -29,7 +30,7 @@ data AnnotationContext = AnnotationContext { , header :: Header } -type Field = Text +type Field = Maybe Text type Header = [Text] type Row = (Line, [Field]) diff --git a/lib/Text/TEIWA/Source/SSV.hs b/lib/Text/TEIWA/Source/SSV.hs index 43c9e3cbf61f16aedadba5b06df64f63df1b0b10..b973771d5a9fed19bffd934ba0e38ddc11bc44a2 100644 --- a/lib/Text/TEIWA/Source/SSV.hs +++ b/lib/Text/TEIWA/Source/SSV.hs @@ -1,17 +1,22 @@ {-# LANGUAGE FlexibleContexts #-} module Text.TEIWA.Source.SSV ( body - , fields + , header ) where import Control.Applicative ((<|>), many) +import Control.Monad.Except (MonadError(..)) +import Data.Maybe (catMaybes) import Data.Text.Lazy as Text (pack) import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try) -import Text.TEIWA.Source.Common (Field, Row, eol, recordLine) +import Text.TEIWA.Source.Common (Field, Header, Row, TEIWAParser, eol, recordLine) +import Text.TEIWA.Error (Error(..)) field :: Stream s m Char => Char -> ParsecT s u m Field -field separator = Text.pack <$> (regular <|> quoted) +field separator = notEmpty <$> (regular <|> quoted) where + notEmpty "" = Nothing + notEmpty s = Just $ Text.pack s regular = many (noneOf $ separator:"\n\r\"") quoted = between quote quote $ many (noneOf "\"" <|> try (string "\"\"" *> pure '"')) @@ -20,5 +25,11 @@ field separator = Text.pack <$> (regular <|> quoted) fields :: Stream s m Char => Char -> ParsecT s u m [Field] fields separator = (field separator `sepBy` char separator) <* eol +header :: Char -> TEIWAParser Header +header separator = fields separator >>= ensureNotEmpty . catMaybes + where + ensureNotEmpty [] = throwError EmptyHeader + ensureNotEmpty l = return l + body :: Stream s m Char => Char -> ParsecT s u m [Row] body = many . recordLine . fields