diff --git a/lib/Data/Map/Lazy/Extra.hs b/lib/Data/Map/Lazy/Extra.hs new file mode 100644 index 0000000000000000000000000000000000000000..32fecae1f98bc373ffadb4d49cbe27d51c7920ee --- /dev/null +++ b/lib/Data/Map/Lazy/Extra.hs @@ -0,0 +1,13 @@ +module Data.Map.Lazy.Extra ( + maxFor + ) where + +import Data.Map as Map (Map, foldrWithKey) + +maxFor :: Ord b => (k -> a -> b) -> Map k a -> Maybe (k, a) +maxFor weight = fmap fst . foldrWithKey (\k a -> Just . keepMax k a) Nothing + where + keepMax k a Nothing = ((k, a), weight k a) + keepMax k a (Just current@(_, w)) = + let w' = weight k a in + if w' > w then ((k, a), w') else current diff --git a/lib/Text/TEIWA/Annotation.hs b/lib/Text/TEIWA/Annotation.hs index 8fba39e04b15253e6c8c5ec860be9d80f6974efb..ac76d8241bcfddbeb27f3eeb192be01498180054 100644 --- a/lib/Text/TEIWA/Annotation.hs +++ b/lib/Text/TEIWA/Annotation.hs @@ -1,8 +1,8 @@ {-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-} module Text.TEIWA.Annotation ( apply - , s_ , tagToken + , teiSentence ) where import Control.Monad.Except (MonadError(..)) @@ -13,7 +13,7 @@ import Text.TEIWA.Annotation.Data (Annotation(..), Node(..), Tag(..)) import Text.TEIWA.Annotation.Editor (annotateWith, start) import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Error (Error(..)) -import Text.TEIWA.Source.Common (Attributes, Field, Row) +import Text.TEIWA.Source.Common (Attributes, Field, Numbered) apply :: MonadError Error m => Config -> Annotation -> Text -> m Text apply config annotation = @@ -22,8 +22,8 @@ apply config annotation = isTEI :: Text -> Bool isTEI = (`elem` ["pos", "lemma", "msd"]) -s_ :: Tag -s_ = Tag {name = "s", annotated = []} +teiSentence :: [Node] -> Node +teiSentence = Node (Tag {name = "s", annotated = []}) . Annotations filterAttributes :: MonadReader Config m => [(Text, Field)] -> m Attributes filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI @@ -35,7 +35,7 @@ filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI let k' = Text.toLower k in if isTEI k' then [(k', v)] else [] tagToken :: (MonadError Error m, MonadReader Config m) => - Context -> Row -> m Node + Context -> Numbered [Field] -> m Node tagToken (Context {columnIndex, columnName, header, tagger}) (atLine, record) = case splitAt columnIndex record of (before, (Just form):after) -> do diff --git a/lib/Text/TEIWA/Error.hs b/lib/Text/TEIWA/Error.hs index 2cb59403148ae3a1dcd177ed6a79b41fd46fcd81..6fd86c38288ae06b9dd7c2c91d64898c542e3cc6 100644 --- a/lib/Text/TEIWA/Error.hs +++ b/lib/Text/TEIWA/Error.hs @@ -12,6 +12,7 @@ data Error = | MissingColumn Line String | ParsingError ParseError | TermNotFound String + | OverlappingAnnotation (Line, String) (Line, String) instance Show Error where show (NoSuchColumn s) = @@ -22,3 +23,5 @@ instance Show Error where show (MissingColumn l s) = printf "Line %d is missing a value for column %s" l s show (ParsingError e) = show e show (TermNotFound t) = printf "Annotated term \"%s\" wasn't found in the input" t + show (OverlappingAnnotation (l1, s1) (l2, s2)) = + printf "Annotations \"%s\" (l.%d) and \"%s\" (l.%d) overlap" s1 l1 s2 l2 diff --git a/lib/Text/TEIWA/Source.hs b/lib/Text/TEIWA/Source.hs index 4fcb725c716b911947d5552e77fbfecff2b9d75a..7a807af082f209b2aee0302b1a22018930ec18f0 100644 --- a/lib/Text/TEIWA/Source.hs +++ b/lib/Text/TEIWA/Source.hs @@ -1,6 +1,4 @@ -{-# LANGUAGE NamedFieldPuns #-} -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-} module Text.TEIWA.Source ( Format , Origin(..) diff --git a/lib/Text/TEIWA/Source/CoNLLX.hs b/lib/Text/TEIWA/Source/CoNLLX.hs index 28b4b490d7c0c778b2a63299b6760a9521f5c740..38baa7d2e462ccf0bb3fc99bb7a5d9a662b3e556 100644 --- a/lib/Text/TEIWA/Source/CoNLLX.hs +++ b/lib/Text/TEIWA/Source/CoNLLX.hs @@ -5,11 +5,11 @@ module Text.TEIWA.Source.CoNLLX ( import Control.Applicative ((<|>), many) import Data.Text.Lazy as Text (concat, pack) -import Text.Parsec (ParsecT, Stream, char, digit, many1, noneOf, string, try) -import Text.TEIWA.Annotation (s_, tagToken) -import Text.TEIWA.Annotation.Data (Annotation(..), Format, Node(..)) +import Text.Parsec (ParsecT, Stream, char, string) +import Text.TEIWA.Annotation (tagToken, teiSentence) +import Text.TEIWA.Annotation.Data (Annotation(..), Format) import Text.TEIWA.Annotation.Context as Context (ofHeader) -import Text.TEIWA.Source.Common (Field, Header, Row, eol, recordLine, sentences) +import Text.TEIWA.Source.Common (Field, Header, avoid, eol, int, sentences) header :: Header header = [ @@ -26,20 +26,18 @@ header = [ ] field :: Stream s m Char => ParsecT s u m Field -field = build <$> many1 (noneOf "\t\n\r") +field = build <$> avoid "\t\n\r" where build "_" = Nothing - build other = Just $ Text.pack other + build other = Just other type Range = (Int, Int) rowID :: Stream s m Char => ParsecT s u m (Either Range Int) -rowID = (Left <$> try ((,) <$> int <* char '-' <*> int)) <|> (Right <$> int) - where - int = read <$> many1 digit +rowID = int >>= \n -> ((char '-' *> (Left . (,) n <$> int)) <|> pure (Right n)) -row :: Stream s m Char => ParsecT s u m Row -row = recordLine (rowID >>= either range singleLine) +row :: Stream s m Char => ParsecT s u m [Field] +row = rowID >>= either range singleLine where toText = Text.pack . show rest = many (char '\t' *> field) <* eol @@ -60,6 +58,6 @@ coNLLX = do context <- Context.ofHeader header Annotations <$> ( sentences row >>= mapM ( - fmap (Node s_ . Annotations) . mapM (tagToken context) + fmap teiSentence . mapM (tagToken context) ) ) diff --git a/lib/Text/TEIWA/Source/Common.hs b/lib/Text/TEIWA/Source/Common.hs index ba31db2fbf7ba941144331b7b012fd515d1299a7..3f4c0533253d721c36bf8af3c0dc71ede0353953 100644 --- a/lib/Text/TEIWA/Source/Common.hs +++ b/lib/Text/TEIWA/Source/Common.hs @@ -3,30 +3,31 @@ module Text.TEIWA.Source.Common ( Attributes , Field , Header - , Row + , Numbered , TEIWAParser , attribute + , avoid , eol + , int , recordLine , sentences , teiTagger ) where -import Control.Applicative ((<|>), many) +import Control.Applicative ((<|>), many, optional) import Control.Monad.Reader (ReaderT) import Data.Char (isPunctuation) -import Data.Text.Lazy as Text (Text, concat, head, length) +import Data.Text.Lazy as Text (Text, concat, head, length, pack) import Text.Parsec ( - Line, ParsecT, Stream, char, endOfLine, getParserState, many1, noneOf - , sepEndBy, sourceLine, statePos, try + Line, ParsecT, Stream, char, digit, endOfLine, getParserState, many1 + , noneOf, sepEndBy, sourceLine, statePos, try ) import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Error (Error(..)) type Field = Maybe Text type Header = [Text] -type Row = (Line, [Field]) -type Sentence = [Row] +type Numbered a = (Line, a) type Attributes = [(Text, Text)] type TEIWAParser = ParsecT Text () (ReaderT Config (Either Error)) @@ -34,21 +35,29 @@ type TEIWAParser = ParsecT Text () (ReaderT Config (Either Error)) eol :: Stream s m Char => ParsecT s u m () eol = (try endOfLine <|> char '\r') *> return () -recordLine :: Monad m => ParsecT s u m a -> ParsecT s u m (Line, a) +int :: Stream s m Char => ParsecT s u m Int +int = read <$> many1 digit + +avoid :: Stream s m Char => [Char] -> ParsecT s u m Text +avoid characters = Text.pack <$> many1 (noneOf characters) + +currentLine :: Monad m => ParsecT s u m Line +currentLine = sourceLine . statePos <$> getParserState + +recordLine :: Monad m => ParsecT s u m a -> ParsecT s u m (Numbered a) recordLine p = (,) <$> currentLine <*> p - where - currentLine = sourceLine . statePos <$> getParserState attribute :: (Text, Text) -> Text attribute (k, v) = Text.concat [" ", k, "=\"", v, "\""] -sentence :: Stream s m Char => ParsecT s u m Row -> ParsecT s u m Sentence -sentence row = many comment *> many1 row +sentence :: Stream s m Char => ParsecT s u m p -> ParsecT s u m [Numbered p] +sentence row = many comment *> many1 (recordLine row) where - comment = char '#' *> many (noneOf "\r\n") <* eol + comment = char '#' *> optional (avoid "\r\n") *> eol -sentences :: Stream s m Char => ParsecT s u m Row -> ParsecT s u m [Sentence] +sentences :: Stream s m Char => ParsecT s u m p -> ParsecT s u m [[Numbered p]] sentences row = sentence row `sepEndBy` many1 eol +--sentences row = many (many1 eol *> sentence row) teiTagger :: Text -> Attributes -> Text teiTagger t _ diff --git a/lib/Text/TEIWA/Source/SSV.hs b/lib/Text/TEIWA/Source/SSV.hs index ed5d3e0a082f8ca98bc4423077f8cbfe1b3848bb..6690374aa035df34912c95c91ab802980983508f 100644 --- a/lib/Text/TEIWA/Source/SSV.hs +++ b/lib/Text/TEIWA/Source/SSV.hs @@ -11,7 +11,9 @@ import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try) import Text.TEIWA.Annotation (tagToken) import Text.TEIWA.Annotation.Context as Context (ofHeader) import Text.TEIWA.Annotation.Data (Annotation(..), Format) -import Text.TEIWA.Source.Common (Field, Header, Row, TEIWAParser, eol, recordLine) +import Text.TEIWA.Source.Common ( + Field, Header, Numbered, TEIWAParser, eol, recordLine + ) import Text.TEIWA.Error (Error(..)) field :: Stream s m Char => Char -> ParsecT s u m Field @@ -33,7 +35,7 @@ header separator = fields separator >>= ensureNotEmpty . catMaybes ensureNotEmpty [] = throwError EmptyHeader ensureNotEmpty l = return l -body :: Stream s m Char => Char -> ParsecT s u m [Row] +body :: Stream s m Char => Char -> ParsecT s u m [Numbered [Field]] body = many . recordLine . fields ssv :: Char -> Format diff --git a/lib/Text/TEIWA/Source/WebAnno.hs b/lib/Text/TEIWA/Source/WebAnno.hs index f2f905b3247846c69f4f38a79f52494eaea619bf..df8447e412200fbe3e0a15295cb284e031a768c2 100644 --- a/lib/Text/TEIWA/Source/WebAnno.hs +++ b/lib/Text/TEIWA/Source/WebAnno.hs @@ -1,52 +1,94 @@ -{-# LANGUAGE FlexibleContexts, OverloadedStrings #-} -module Text.TEIWA.Source.WebAnno ( +{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-} +module Text.TEIWA.Source.WebAnno {-( webAnno - ) where + )-} where -import Control.Applicative (many) -import Data.Text.Lazy as Text (pack) +import Control.Applicative ((<|>), many, optional) +import Control.Monad.Except (MonadError) +import Data.Either (partitionEithers) +import Data.Map as Map (Map, empty, insertWith) +import Data.Map.Lazy.Extra as Map (maxFor) +import Data.List (foldl') +import Data.Text.Lazy as Text (Text, pack) import Text.Parsec ( - ParsecT, Stream, char, digit, many1, noneOf, sepBy1, skipMany1, string + Line, ParsecT, Stream, anyChar, char, count, many1, noneOf, sepBy1, skipMany1 + , string ) -import Text.TEIWA.Annotation (s_, tagToken) -import Text.TEIWA.Annotation.Context as Context (ofHeader) +import Text.TEIWA.Annotation (tagToken, teiSentence) +import Text.TEIWA.Annotation.Context (Context) +import qualified Text.TEIWA.Annotation.Context as Context (Context(..), ofHeader) import Text.TEIWA.Annotation.Data (Annotation(..), Format, Node(..)) +import Text.TEIWA.Error (Error) import Text.TEIWA.Source.Common ( - Field, Header, Row, TEIWAParser, eol, recordLine, sentences + Field, Header, Numbered, TEIWAParser, avoid, eol, int, recordLine, sentences ) -header :: TEIWAParser Header +pragma :: Stream s m Char => ParsecT s u m Text +pragma = char '#' *> avoid "=" <* char '=' + +header :: Stream s m Char => ParsecT s u m Header header = do - string "#FORMAT=WebAnno TSV " *> version - many comment *> pure () - pure [ - "ID" - , "SPAN" - , "FORM" - , "LEMMA" - , "POS" - ] + pragma *> string "WebAnno TSV " *> int `sepBy1` char '.' *> eol + --concat <$> (pragma >>= columns) `sepBy1` eol + ("FORM":) . concat <$> many ((pragma >>= columns) <* eol) where - version = skipMany1 digit `sepBy1` char '.' *> eol - comment = char '#' *> many (noneOf "\r\n") <* eol + columns "T_SP" = drop 1 <$> avoid "|\r\n" `sepBy1` char '|' + columns _ = avoid "\r\n" *> pure [] + +reserved :: [Char] +reserved = "\\[]|_-;\t\n*" + +type TokenAnnotation = (Text, Int) -field :: Stream s m Char => ParsecT s u m Field -field = build <$> many1 (noneOf "\t\n\r") +field :: Stream s m Char => ParsecT s u m [TokenAnnotation] +field = noValue <|> component `sepBy1` (char '|') where - build "_" = Nothing - build other = Just . Text.pack $ unescape other - unescape "" = "" - unescape ('\\':c:cs) = c:unescape cs - unescape (c:cs) = c:unescape cs + noValue = char '_' *> pure [] + uniqueID = char '[' *> int <* char ']' + component = (,) + <$> (Text.pack <$> many1 (noneOf reserved <|> (char '\\' *> anyChar))) + <*> (maybe 0 id <$> optional uniqueID) + +data Row = Row { + form :: Text + , fields :: [[TokenAnnotation]] + } row :: Stream s m Char => ParsecT s u m Row -row = recordLine (many1 (field <* char '\t') <* eol) +row = makeRow <$> count 3 column <*> many1 (field <* char '\t') <* eol + where + column = avoid "\t\n" <* char '\t' + makeRow columns fields = Row {form = columns !! 2, fields} + +getSpans :: Context -> [Numbered Row] -> Map (Text, TokenAnnotation) (Line, Line) +getSpans context numbered = + foldl' (\tmpMap (lineNumber, row) -> + foldl' (\tmpMap' (hK, labels) -> + foldr (\label -> + Map.insertWith extendSpan (hK, label) (lineNumber, lineNumber) + ) tmpMap' labels + ) tmpMap row + ) Map.empty byHeader + where + byHeader = fmap (zip (Context.header context) . fields) <$> numbered + extendSpan (_, newMax) (oldMin, _) = (oldMin, newMax) + +extractNodes :: MonadError Error m => Context -> [Numbered Row] -> m [Node] +extractNodes _ [] = pure [] +extractNodes context numbered = undefined + where + spans = getSpans context numbered + maximalSpan = maxFor (\_ (minL, maxL) -> maxL - minL) + +{- +extractNodes context ((lineNumber, Row {fields, spans = []}):others) = + tagToken (lineNumber, fields) : extractNodes context others +extractNode context ((lineNumber, Row {fields, spans}):others) = undefined +-} webAnno :: Format webAnno = do context <- Context.ofHeader =<< header - Annotations <$> ( - (many eol *> sentences row) >>= mapM ( - fmap (Node s_ . Annotations) . mapM (tagToken context) - ) + Annotations . fmap teiSentence <$> ( + (many eol *> sentences row) >>= mapM (extractNodes context) ) diff --git a/teiwa.cabal b/teiwa.cabal index e810e7a55b5991ef00e717f478c73c654e6f26e8..279b75cc92a623cd91557920dc73c8711e739e32 100644 --- a/teiwa.cabal +++ b/teiwa.cabal @@ -25,7 +25,8 @@ extra-source-files: CHANGELOG.md library exposed-modules: Text.TEIWA , Text.TEIWA.Annotation.Data - other-modules: Text.TEIWA.Annotation + other-modules: Data.Map.Lazy.Extra + , Text.TEIWA.Annotation , Text.TEIWA.Annotation.Context , Text.TEIWA.Annotation.Editor , Text.TEIWA.Config @@ -37,6 +38,7 @@ library , Text.TEIWA.Source.WebAnno build-depends: base >=4.12 && <4.15 , bytestring + , containers , mtl , parsec , text