Skip to content
Snippets Groups Projects
Commit f8a0f4b1 authored by Alice Brenon's avatar Alice Brenon
Browse files

Finish implementing the annotation-building process for WebAnno + add a lot of...

Finish implementing the annotation-building process for WebAnno + add a lot of unit and regression tests
parent d62768cd
No related branches found
No related tags found
No related merge requests found
Showing
with 584 additions and 117 deletions
module Data.Map.Lazy.Extra ( module Data.Map.Lazy.Extra (
maxFor foldMWithKey
, maxFor
, splitOn
) where ) where
import Data.Map as Map (Map, foldrWithKey) import Control.Monad.State (evalState, get, state)
import Data.Map as Map (Map, foldlWithKey', foldrWithKey, spanAntitone)
maxFor :: Ord b => (k -> a -> b) -> Map k a -> Maybe (k, a) -- | Evaluate a function on each entry contained in a map to find its maximum.
maxFor weight = fmap fst . foldrWithKey (\k a -> Just . keepMax k a) Nothing -- It returns not only the key and input value but also the maximum reached
maxFor :: Ord b => (k -> a -> b) -> Map k a -> Maybe (k, a, b)
maxFor weight = foldrWithKey (\k a -> Just . keepMax k a) Nothing
where where
keepMax k a Nothing = ((k, a), weight k a) keepMax k a Nothing = (k, a, weight k a)
keepMax k a (Just current@(_, w)) = keepMax k a (Just current@(_, _, w)) =
let w' = weight k a in let w' = weight k a in
if w' > w then ((k, a), w') else current if w' > w then (k, a, w') else current
foldMWithKey :: Monad m => (b -> k -> a -> m b) -> b -> Map k a -> m b
foldMWithKey f = foldlWithKey' (\m k v -> m >>= (\a -> f a k v)) . pure
-- | Given two keys '(from, to)', cuts a map in three:
-- + the submap containing the keys lower than 'from' and their values
-- + the submap of keys between 'from' and 'to' (inclusive)
-- + the submap of keys strictly greater than 'to'
splitOn :: Ord k => (k, k) -> Map k a -> (Map k a, Map k a, Map k a)
splitOn (from, to) = evalState ((,,) <$> pop (< from) <*> pop (<= to) <*> get)
where
pop = state . spanAntitone
module Data.Tuple.Extra (
onEach
, zipTriple
) where
zipTriple :: (a -> b -> c) -> (a, a, a) -> (b, b, b) -> (c, c, c)
zipTriple f (a0, b0, c0) (a1, b1, c1) = (f a0 a1, f b0 b1, f c0 c1)
onEach :: Applicative f => (a -> f b) -> (a, a, a) -> f (b, b, b)
onEach f (a, b, c) = (,,) <$> f a <*> f b <*> f c
...@@ -19,6 +19,8 @@ apply :: MonadError Error m => Config -> Annotation -> Text -> m Text ...@@ -19,6 +19,8 @@ apply :: MonadError Error m => Config -> Annotation -> Text -> m Text
apply config annotation = apply config annotation =
fmap snd . evalRWST (annotateWith annotation) config . start fmap snd . evalRWST (annotateWith annotation) config . start
-- | TEI-C defines a Lightweight Linguistic Annotation framework
-- https://tei-c.org/release/doc/tei-p5-doc/en/html/AI.html#AILALW
isTEI :: Text -> Bool isTEI :: Text -> Bool
isTEI = (`elem` ["pos", "lemma", "msd"]) isTEI = (`elem` ["pos", "lemma", "msd"])
......
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
module Text.TEIWA.Annotation.Data ( module Text.TEIWA.Annotation.Data (
Annotation(..) Annotation(..)
, Format , Format
...@@ -9,7 +9,7 @@ module Text.TEIWA.Annotation.Data ( ...@@ -9,7 +9,7 @@ module Text.TEIWA.Annotation.Data (
) where ) where
import Data.Text.Lazy as Text (Text, concat, pack) import Data.Text.Lazy as Text (Text, concat, pack)
import Text.TEIWA.Source.Common (Attributes, TEIWAParser, attribute) import Text.TEIWA.Source.Common (Attributes, TEIWAParser)
import Text.Printf (printf) import Text.Printf (printf)
data Tag = Tag { data Tag = Tag {
...@@ -29,3 +29,6 @@ openTag (Tag {name, annotated}) = ...@@ -29,3 +29,6 @@ openTag (Tag {name, annotated}) =
closeTag :: Tag -> Text closeTag :: Tag -> Text
closeTag = Text.pack . printf "</%s>" . name closeTag = Text.pack . printf "</%s>" . name
attribute :: (Text, Text) -> Text
attribute (k, v) = Text.concat [" ", k, "=\"", v, "\""]
...@@ -13,6 +13,8 @@ data Error = ...@@ -13,6 +13,8 @@ data Error =
| ParsingError ParseError | ParsingError ParseError
| TermNotFound String | TermNotFound String
| OverlappingAnnotation (Line, String) (Line, String) | OverlappingAnnotation (Line, String) (Line, String)
| Debug String
deriving (Eq)
instance Show Error where instance Show Error where
show (NoSuchColumn s) = show (NoSuchColumn s) =
...@@ -25,3 +27,5 @@ instance Show Error where ...@@ -25,3 +27,5 @@ instance Show Error where
show (TermNotFound t) = printf "Annotated term \"%s\" wasn't found in the input" t show (TermNotFound t) = printf "Annotated term \"%s\" wasn't found in the input" t
show (OverlappingAnnotation (l1, s1) (l2, s2)) = show (OverlappingAnnotation (l1, s1) (l2, s2)) =
printf "Annotations \"%s\" (l.%d) and \"%s\" (l.%d) overlap" s1 l1 s2 l2 printf "Annotations \"%s\" (l.%d) and \"%s\" (l.%d) overlap" s1 l1 s2 l2
show (Debug s) =
printf "DEBUG: %s" s
{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-} {-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings, ScopedTypeVariables #-}
module Text.TEIWA.Source ( module Text.TEIWA.Source (
Format Format
, Origin(..) , Origin(..)
......
{-# LANGUAGE FlexibleContexts, OverloadedStrings #-} {-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-}
module Text.TEIWA.Source.CoNLLX ( module Text.TEIWA.Source.CoNLLX (
coNLLX coNLLX
) where ) where
import Control.Applicative ((<|>), many) import Control.Applicative ((<|>), many)
import Data.Text.Lazy as Text (concat, pack) import Data.Text.Lazy as Text (Text, concat)
import Text.Parsec (ParsecT, Stream, char, string) import Text.Parsec (ParsecT, Stream, char, string)
import Text.TEIWA.Annotation (tagToken, teiSentence) import Text.TEIWA.Annotation (tagToken, teiSentence)
import Text.TEIWA.Annotation.Data (Annotation(..), Format) import Text.TEIWA.Annotation.Data (Annotation(..), Format)
import Text.TEIWA.Annotation.Context as Context (ofHeader) import Text.TEIWA.Annotation.Context as Context (ofHeader)
import Text.TEIWA.Source.Common (Field, Header, avoid, eol, int, sentences) import Text.TEIWA.Source.Common (
Field, Header, Range(..), avoid, eol, int, range, sentences, toField
)
header :: Header header :: Header
header = [ header = [
...@@ -25,34 +27,29 @@ header = [ ...@@ -25,34 +27,29 @@ header = [
, "PDEPREL" , "PDEPREL"
] ]
field :: Stream s m Char => ParsecT s u m Field fields :: Stream s m Char => ParsecT s u m [Field]
field = build <$> avoid "\t\n\r" fields = many (char '\t' *> (build <$> avoid "\t\n\r")) <* eol
where where
build "_" = Nothing build "_" = Nothing
build other = Just other build other = Just other
type Range = (Int, Int) combineSubs :: [Maybe Text] -> [Maybe Text] -> [Maybe Text]
combineSubs = zipWith combineFields
rowID :: Stream s m Char => ParsecT s u m (Either Range Int)
rowID = int >>= \n -> ((char '-' *> (Left . (,) n <$> int)) <|> pure (Right n))
row :: Stream s m Char => ParsecT s u m [Field]
row = rowID >>= either range singleLine
where where
toText = Text.pack . show
rest = many (char '\t' *> field) <* eol
singleLine n = (Just (toText n):) <$> rest
range (from, to) =
let rangeID = Text.concat [toText from, "-", toText to] in do
main <- rest
subs <- mapM (\k -> string (show k) *> rest) [from .. to]
return $ (Just rangeID):(zipWith (<|>) main $ foldl1 combineSubs subs)
combineSubs = zipWith combineFields
combineFields Nothing Nothing = Nothing combineFields Nothing Nothing = Nothing
combineFields Nothing f@(Just _) = f combineFields Nothing f@(Just _) = f
combineFields f@(Just _) Nothing = f combineFields f@(Just _) Nothing = f
combineFields (Just a) (Just b) = Just $ Text.concat [a, "+", b] combineFields (Just a) (Just b) = Just $ Text.concat [a, "+", b]
row :: Stream s m Char => ParsecT s u m [Field]
row = (range >>= rangeLines) <|> (int >>= singleLine)
where
singleLine n = (toField n:) <$> fields
rangeLines r@(Range {from, to}) = do
main <- fields
subs <- mapM (\k -> string (show k) *> fields) [from .. to]
pure $ (toField r):(zipWith (<|>) main $ foldl1 combineSubs subs)
coNLLX :: Format coNLLX :: Format
coNLLX = do coNLLX = do
context <- Context.ofHeader header context <- Context.ofHeader header
......
{-# LANGUAGE FlexibleContexts, OverloadedStrings #-} {-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-}
module Text.TEIWA.Source.Common ( module Text.TEIWA.Source.Common (
Attributes Attributes
, Field , Field
, Header , Header
, Numbered , Numbered
, Range(..)
, TEIWAParser , TEIWAParser
, attribute
, avoid , avoid
, eol , eol
, hull
, int , int
, range
, recordLine , recordLine
, sentences , sentences
, size
, teiTagger , teiTagger
, toField
, within
) where ) where
import Control.Applicative ((<|>), many, optional) import Control.Applicative ((<|>), many, optional)
import Control.Monad.Reader (ReaderT) import Control.Monad.Reader (ReaderT)
import Data.Char (isPunctuation) import Data.Char (isPunctuation)
import Data.Text.Lazy as Text (Text, concat, head, length, pack) import Data.Text.Lazy as Text (Text, head, length, pack)
import Text.Parsec ( import Text.Parsec (
Line, ParsecT, Stream, char, digit, endOfLine, getParserState, many1 Line, ParsecT, Stream, char, digit, endOfLine, getParserState, many1
, noneOf, sepEndBy, sourceLine, statePos, try , noneOf, sepEndBy, sourceLine, statePos, try
...@@ -27,6 +32,24 @@ import Text.TEIWA.Error (Error(..)) ...@@ -27,6 +32,24 @@ import Text.TEIWA.Error (Error(..))
type Field = Maybe Text type Field = Maybe Text
type Header = [Text] type Header = [Text]
data Range = Range {
from :: Int
, to :: Int
} deriving (Eq)
instance Show Range where
show (Range {from, to}) = show from ++ "-" ++ show to
within :: Int -> Range -> Bool
within i (Range {from, to}) = from <= i && i <= to
hull :: Range -> Range -> Range
hull (Range newMin newMax) (Range oldMin oldMax) =
Range (min newMin oldMin) (max newMax oldMax)
size :: Range -> Int
size (Range {from, to}) = to - from
type Numbered a = (Line, a) type Numbered a = (Line, a)
type Attributes = [(Text, Text)] type Attributes = [(Text, Text)]
...@@ -47,9 +70,6 @@ currentLine = sourceLine . statePos <$> getParserState ...@@ -47,9 +70,6 @@ currentLine = sourceLine . statePos <$> getParserState
recordLine :: Monad m => ParsecT s u m a -> ParsecT s u m (Numbered a) recordLine :: Monad m => ParsecT s u m a -> ParsecT s u m (Numbered a)
recordLine p = (,) <$> currentLine <*> p recordLine p = (,) <$> currentLine <*> p
attribute :: (Text, Text) -> Text
attribute (k, v) = Text.concat [" ", k, "=\"", v, "\""]
sentence :: Stream s m Char => ParsecT s u m p -> ParsecT s u m [Numbered p] sentence :: Stream s m Char => ParsecT s u m p -> ParsecT s u m [Numbered p]
sentence row = many comment *> many1 (recordLine row) sentence row = many comment *> many1 (recordLine row)
where where
...@@ -57,9 +77,15 @@ sentence row = many comment *> many1 (recordLine row) ...@@ -57,9 +77,15 @@ sentence row = many comment *> many1 (recordLine row)
sentences :: Stream s m Char => ParsecT s u m p -> ParsecT s u m [[Numbered p]] sentences :: Stream s m Char => ParsecT s u m p -> ParsecT s u m [[Numbered p]]
sentences row = sentence row `sepEndBy` many1 eol sentences row = sentence row `sepEndBy` many1 eol
--sentences row = many (many1 eol *> sentence row)
range :: Stream s m Char => ParsecT s u m Range
range = try (Range <$> int <* char '-' <*> int)
toField :: Show a => a -> Field
toField = Just . Text.pack . show
teiTagger :: Text -> Attributes -> Text teiTagger :: Text -> Attributes -> Text
teiTagger t _ teiTagger t _
| Text.length t == 1 && isPunctuation (Text.head t) = "pc" | Text.length t == 1 && isPunctuation (Text.head t) = "pc"
| Text.length t == 0 = "rs"
| otherwise = "w" | otherwise = "w"
{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-} {-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-}
module Text.TEIWA.Source.WebAnno {-( module Text.TEIWA.Source.WebAnno (
webAnno webAnno
)-} where ) where
import Control.Applicative ((<|>), many, optional) import Control.Applicative ((<|>), many, optional)
import Control.Monad.Except (MonadError)
import Data.Either (partitionEithers)
import Data.Map as Map (Map, empty, insertWith)
import Data.Map.Lazy.Extra as Map (maxFor)
import Data.List (foldl')
import Data.Text.Lazy as Text (Text, pack) import Data.Text.Lazy as Text (Text, pack)
import Text.Parsec ( import Text.Parsec (
Line, ParsecT, Stream, anyChar, char, count, many1, noneOf, sepBy1, skipMany1 ParsecT, Stream, anyChar, char, many1, noneOf, sepBy1, string
, string
)
import Text.TEIWA.Annotation (tagToken, teiSentence)
import Text.TEIWA.Annotation.Context (Context)
import qualified Text.TEIWA.Annotation.Context as Context (Context(..), ofHeader)
import Text.TEIWA.Annotation.Data (Annotation(..), Format, Node(..))
import Text.TEIWA.Error (Error)
import Text.TEIWA.Source.Common (
Field, Header, Numbered, TEIWAParser, avoid, eol, int, recordLine, sentences
) )
import Text.TEIWA.Annotation.Context as Context (ofHeader)
import Text.TEIWA.Annotation.Data (Format)
import Text.TEIWA.Source.Common (Header, avoid, eol, int, range, sentences)
import Text.TEIWA.Source.WebAnno.Annotator (annotate)
import Text.TEIWA.Source.WebAnno.Data (AnnotationValue(..), Row(..))
pragma :: Stream s m Char => ParsecT s u m Text pragma :: Stream s m Char => ParsecT s u m Text
pragma = char '#' *> avoid "=" <* char '=' pragma = char '#' *> avoid "=" <* char '='
...@@ -29,8 +20,7 @@ pragma = char '#' *> avoid "=" <* char '=' ...@@ -29,8 +20,7 @@ pragma = char '#' *> avoid "=" <* char '='
header :: Stream s m Char => ParsecT s u m Header header :: Stream s m Char => ParsecT s u m Header
header = do header = do
pragma *> string "WebAnno TSV " *> int `sepBy1` char '.' *> eol pragma *> string "WebAnno TSV " *> int `sepBy1` char '.' *> eol
--concat <$> (pragma >>= columns) `sepBy1` eol concat . (["ID", "SPAN", "FORM"]:) <$> many ((pragma >>= columns) <* eol)
("FORM":) . concat <$> many ((pragma >>= columns) <* eol)
where where
columns "T_SP" = drop 1 <$> avoid "|\r\n" `sepBy1` char '|' columns "T_SP" = drop 1 <$> avoid "|\r\n" `sepBy1` char '|'
columns _ = avoid "\r\n" *> pure [] columns _ = avoid "\r\n" *> pure []
...@@ -38,57 +28,23 @@ header = do ...@@ -38,57 +28,23 @@ header = do
reserved :: [Char] reserved :: [Char]
reserved = "\\[]|_-;\t\n*" reserved = "\\[]|_-;\t\n*"
type TokenAnnotation = (Text, Int) field :: Stream s m Char => ParsecT s u m [AnnotationValue]
field :: Stream s m Char => ParsecT s u m [TokenAnnotation]
field = noValue <|> component `sepBy1` (char '|') field = noValue <|> component `sepBy1` (char '|')
where where
noValue = char '_' *> pure [] noValue = char '_' *> pure []
uniqueID = char '[' *> int <* char ']' bracketedID = char '[' *> int <* char ']'
component = (,) component = AnnotationValue
<$> (Text.pack <$> many1 (noneOf reserved <|> (char '\\' *> anyChar))) <$> (Text.pack <$> many1 (noneOf reserved <|> (char '\\' *> anyChar)))
<*> (maybe 0 id <$> optional uniqueID) <*> (optional bracketedID)
data Row = Row {
form :: Text
, fields :: [[TokenAnnotation]]
}
row :: Stream s m Char => ParsecT s u m Row row :: Stream s m Char => ParsecT s u m Row
row = makeRow <$> count 3 column <*> many1 (field <* char '\t') <* eol row = Row
where <$> (range <* char '\t')
column = avoid "\t\n" <* char '\t' <*> (range <* char '\t')
makeRow columns fields = Row {form = columns !! 2, fields} <*> (avoid "\t\n" <* char '\t')
<*> many1 (field <* char '\t') <* eol
getSpans :: Context -> [Numbered Row] -> Map (Text, TokenAnnotation) (Line, Line)
getSpans context numbered =
foldl' (\tmpMap (lineNumber, row) ->
foldl' (\tmpMap' (hK, labels) ->
foldr (\label ->
Map.insertWith extendSpan (hK, label) (lineNumber, lineNumber)
) tmpMap' labels
) tmpMap row
) Map.empty byHeader
where
byHeader = fmap (zip (Context.header context) . fields) <$> numbered
extendSpan (_, newMax) (oldMin, _) = (oldMin, newMax)
extractNodes :: MonadError Error m => Context -> [Numbered Row] -> m [Node]
extractNodes _ [] = pure []
extractNodes context numbered = undefined
where
spans = getSpans context numbered
maximalSpan = maxFor (\_ (minL, maxL) -> maxL - minL)
{-
extractNodes context ((lineNumber, Row {fields, spans = []}):others) =
tagToken (lineNumber, fields) : extractNodes context others
extractNode context ((lineNumber, Row {fields, spans}):others) = undefined
-}
webAnno :: Format webAnno :: Format
webAnno = do webAnno = do
context <- Context.ofHeader =<< header context <- Context.ofHeader =<< header
Annotations . fmap teiSentence <$> ( (many eol *> sentences row) >>= annotate context
(many eol *> sentences row) >>= mapM (extractNodes context)
)
{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-}
module Text.TEIWA.Source.WebAnno.Annotator (
AnnotationDB(..)
, UniqueCoordinate(..)
, annotate
, buildTree
, index
, indexCell
, splitDB
) where
import Control.Monad.Except (MonadError(..))
import Control.Monad.RWS (MonadReader)
import Data.Map as Map (Map, delete, empty, insert, insertWith, toList)
import Data.Map.Lazy.Extra as Map (foldMWithKey, maxFor, splitOn)
import Data.List as List (foldl')
import Data.Text.Lazy as Text (Text, intercalate)
import Data.Tuple.Extra (onEach, zipTriple)
import Text.Parsec (Line)
import Text.TEIWA.Annotation (tagToken, teiSentence)
import Text.TEIWA.Annotation.Context (Context)
import qualified Text.TEIWA.Annotation.Context as Context (Context(..))
import Text.TEIWA.Annotation.Data (Annotation(..), Node(..), Tag(..))
import Text.TEIWA.Config (Config)
import Text.TEIWA.Error (Error(..))
import Text.TEIWA.Source.Common (
Field, Numbered, Range(..), hull , size, toField
)
import Text.TEIWA.Source.WebAnno.Data (
AnnotationDB(..), AnnotationValue(..), Row(..), UniqueCoordinate(..)
)
-- | Each cell in a WebAnno file may contain several 'AnnotationValue's which
-- are indexed separately:
-- + the multiline ones, with an ID, get pushed to the 'Range' map
-- + the others are simply joined by a "+" and become a 'Field' in the row
indexCell :: Line -> (Map UniqueCoordinate Range, [Field]) -> (Text, [AnnotationValue])
-> (Map UniqueCoordinate Range, [Field])
indexCell line (db, lineFields) (tagName, annotationValues) =
let (newDB, tags) = foldr switch (db, []) annotationValues in
(newDB, (fieldOfTags tags):lineFields)
where
switch (AnnotationValue newTag Nothing) (tmpDB, tmpTags) =
(tmpDB, newTag:tmpTags)
switch (AnnotationValue value (Just uniqueID)) (tmpDB, tmpTags) =
let uniqueCoords = UniqueCoordinate {tagName, value, uniqueID} in
(Map.insertWith hull uniqueCoords (Range line line) tmpDB, tmpTags)
fieldOfTags [] = Nothing
fieldOfTags tags = Just $ Text.intercalate "+" tags
-- | Given a 'Context' c to provide the header fields, we can now index a
-- '[[Numbered Rows]]' into an 'AnnotationDB' by folding twice on the rows, and
-- for each one build the corresponding line by folding the previous function on
-- each cell
index :: Context -> [[Numbered Row]] -> AnnotationDB
index context = foldl' (foldl' indexLine) (AnnotationDB Map.empty Map.empty)
where
indexLine db (line, Row {rowId, charSpan, form, fields}) =
let cells = zip (drop 2 $ Context.header context) fields
-- FORM column will already have been removed by Context.ofHeader so we
-- need to drop only 2 fields (ID and SPAN, which are built-in for this
-- format)
tmpPair = (multiRows db, [Just form, toField charSpan, toField rowId])
(newMRs, newLine) = foldl' (indexCell line) tmpPair cells in
AnnotationDB newMRs (Map.insert line (reverse newLine) $ lineFields db)
-- | As the annotation tree will be built recursively, we will need to split the
-- DB at each step into the DBs of lines strictly before the maximal multiline
-- annotation found, the lines contained within it and the lines strictly after
-- it
splitDB :: MonadError Error m => (UniqueCoordinate, Range) -> AnnotationDB ->
m (AnnotationDB, AnnotationDB, AnnotationDB)
splitDB (maxCoords, Range {from, to}) (AnnotationDB {multiRows, lineFields}) =
zipTriple (flip AnnotationDB)
(splitOn (from, to) lineFields)
<$> (foldMWithKey classify emptyTriple $ Map.delete maxCoords multiRows)
where
classify (lower, within, greater) k r@(Range otherFrom otherTo)
| otherTo < from = pure (Map.insert k r lower, within, greater)
| to < otherFrom = pure (lower, within, Map.insert k r greater)
| from <= otherFrom && otherTo <= to =
pure (lower, Map.insert k r within, greater)
| otherwise = throwError $
OverlappingAnnotation (from, show maxCoords) (otherFrom, show k)
emptyTriple = (Map.empty, Map.empty, Map.empty)
-- | Once the annotations database is built, an 'Annotation' tree may be
-- retrieved by taking the multiline annotations by decreasing inclusion order,
-- and outputting the simple rows for each interval the delimitate
buildTree :: (MonadError Error m, MonadReader Config m) =>
Context -> AnnotationDB -> m [Node]
buildTree context db@(AnnotationDB {multiRows, lineFields}) =
case maxFor (\_ -> size) multiRows of
Just (maxCoords@(UniqueCoordinate {tagName, value}), bounds, _) -> do
shards <- splitDB (maxCoords, bounds) db
(lower, within, greater) <- onEach (buildTree context) shards
let annotated = [(tagName, value)]
let name = (Context.tagger context) "" annotated
let n = Node (Tag {name, annotated}) (Annotations within)
pure (lower ++ (n:greater))
_ -> mapM (tagToken context) $ Map.toList lineFields
annotate :: (MonadError Error m, MonadReader Config m) =>
Context -> [[Numbered Row]] -> m Annotation
annotate context = fmap Annotations . buildTree context . index context
{-# LANGUAGE NamedFieldPuns #-}
module Text.TEIWA.Source.WebAnno.Data (
AnnotationDB(..)
, AnnotationValue(..)
, Row(..)
, UniqueCoordinate(..)
) where
import Data.Map as Map (Map)
import Data.Text.Lazy (Text, unpack)
import Text.Parsec (Line)
import Text.TEIWA.Source.Common (Field, Range)
data AnnotationValue = AnnotationValue {
labelTag :: Text
, labelId :: Maybe Int
} deriving (Eq, Ord)
instance Show AnnotationValue where
show (AnnotationValue {labelTag, labelId}) =
unpack labelTag ++ maybe "" show labelId
data Row = Row {
rowId :: Range
, charSpan :: Range
, form :: Text
, fields :: [[AnnotationValue]]
} deriving (Eq, Show)
-- | Since annotation tags can be found on several lines, they are distinguished
-- | with their numeric ID written within brackets.
-- | It entails that a pair containing a tag name and a value isn't unique, so
-- | we need to define a type for unique coordinates:
data UniqueCoordinate = UniqueCoordinate {
tagName :: Text
, value :: Text
, uniqueID :: Int
} deriving (Eq, Ord, Show)
-- | We can now define a type representing the indexed data for an annotation,
-- separating the annotations spanning on several rows from the single-row
-- ones which are much similar to the types found in the other supported
-- formats.
data AnnotationDB = AnnotationDB {
multiRows :: Map UniqueCoordinate Range
, lineFields :: Map Line [Field]
} deriving (Eq, Show)
...@@ -23,19 +23,22 @@ build-type: Simple ...@@ -23,19 +23,22 @@ build-type: Simple
extra-source-files: CHANGELOG.md extra-source-files: CHANGELOG.md
library library
exposed-modules: Text.TEIWA exposed-modules: Data.Map.Lazy.Extra
, Text.TEIWA.Annotation.Data , Data.Tuple.Extra
other-modules: Data.Map.Lazy.Extra , Text.TEIWA
, Text.TEIWA.Annotation
, Text.TEIWA.Annotation.Context , Text.TEIWA.Annotation.Context
, Text.TEIWA.Annotation.Data
, Text.TEIWA.Source.Common
, Text.TEIWA.Source.WebAnno
, Text.TEIWA.Source.WebAnno.Annotator
, Text.TEIWA.Source.WebAnno.Data
other-modules: Text.TEIWA.Annotation
, Text.TEIWA.Annotation.Editor , Text.TEIWA.Annotation.Editor
, Text.TEIWA.Config , Text.TEIWA.Config
, Text.TEIWA.Error , Text.TEIWA.Error
, Text.TEIWA.Source , Text.TEIWA.Source
, Text.TEIWA.Source.Common
, Text.TEIWA.Source.CoNLLX , Text.TEIWA.Source.CoNLLX
, Text.TEIWA.Source.SSV , Text.TEIWA.Source.SSV
, Text.TEIWA.Source.WebAnno
build-depends: base >=4.12 && <4.15 build-depends: base >=4.12 && <4.15
, bytestring , bytestring
, containers , containers
...@@ -66,10 +69,30 @@ test-suite regression ...@@ -66,10 +69,30 @@ test-suite regression
type: detailed-0.9 type: detailed-0.9
test-module: Regression test-module: Regression
other-modules: Mock.Annotation other-modules: Mock.Annotation
, Sources , Regression.Text.TEIWA.Source
, Utils
build-depends: base
, Cabal
, mtl
, teiwa
, text
hs-source-dirs: test
ghc-options: -Wall
default-language: Haskell2010
test-suite unit
type: detailed-0.9
test-module: Unit
other-modules: Mock.Map
, Mock.Text.TEIWA
, Mock.WebAnno
, Unit.Data.Map.Lazy.Extra
, Unit.Text.TEIWA.Source.WebAnno.Annotator
, Unit.Text.TEIWA.Annotation.Context
, Utils , Utils
build-depends: base build-depends: base
, Cabal , Cabal
, containers
, mtl , mtl
, teiwa , teiwa
, text , text
......
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Mock.Annotation ( module Mock.Annotation (
coNLLX coNLLX
, node
, ssv , ssv
, webAnno , webAnno
) where ) where
...@@ -95,6 +96,41 @@ ssv = Annotations $ zipWith (node ["LEMMA", "POS"]) (concat sentences) [ ...@@ -95,6 +96,41 @@ ssv = Annotations $ zipWith (node ["LEMMA", "POS"]) (concat sentences) [
, [".", "PUNCT"] , [".", "PUNCT"]
] ]
webAnno :: Annotation
webAnno = Annotations $
(zipWith mkNode (sentences !! 0) [
["1-1", "0-3", "you", "PRON"]
, ["1-2", "4-7", "can", "AUX"]
, ["1-3", "8-12", "talk", "VERB"]
, ["1-4", "13-15", "to", "ADP"]
, ["1-5", "16-18", "I", "PRON"]
, ["1-6", "18-19", ".", "PUNCT"]
])
++ [
syntax "CONDITION" [
mkNode (sentences !! 1 !! 0) ["2-1", "0-2", "if", "SCONJ"]
, syntax "HYPOTHESIS" $
zipWith mkNode (take 3 $ drop 1 (sentences !! 1)) [
["2-2", "3-6", "you", "PRON"]
, ["2-3", "6-9", "be", "AUX"]
, ["2-4", "10-16", "lonely", "ADJ"]
]
, syntax "CONCLUSION" $
zipWith mkNode (take 5 $ drop 4 (sentences !! 1)) [
["2-5", "17-20", "you", "PRON"]
, ["2-6", "21-24", "can", "AUX"]
, ["2-7", "25-29", "talk", "VERB"]
, ["2-8", "30-32", "to", "ADP"]
, ["2-9", "32-34", "I", "PRON"]
]
]
, mkNode (sentences !! 1 !! 9) ["2-10", "34-35", ".", "PUNCT"]
]
where
mkNode = node ["ID", "SPAN", "LEMMA", "POS"]
syntax s = Node (Tag "rs" [("SYNTAX", s)]) . Annotations
{-
webAnno :: Annotation webAnno :: Annotation
webAnno = Annotations $ webAnno = Annotations $
sentence <$> sentence <$>
...@@ -121,3 +157,4 @@ webAnno = Annotations $ ...@@ -121,3 +157,4 @@ webAnno = Annotations $
) )
where where
mkNode = node ["SPAN", "LEMMA", "POS"] mkNode = node ["SPAN", "LEMMA", "POS"]
-}
module Mock.Map (
frenchDigits
, h2G2
) where
import Data.Map.Lazy as Map (Map, fromList)
h2G2 :: Map Int String
h2G2 = Map.fromList [(42, "answer")]
frenchDigits :: Map Int String
frenchDigits = Map.fromList [
(0, "zero")
, (1, "un")
, (2, "deux")
, (3, "trois")
, (4, "quatre")
, (5, "cinq")
, (6, "six")
, (7, "sept")
, (8, "huit")
, (9, "neuf")
]
{-# LANGUAGE OverloadedStrings #-}
module Mock.Text.TEIWA (
customHeaderConfig
, defaultHeader
, newFormConfig
, otherFormConfig
, otherHeader
) where
import Text.TEIWA (Config(..), defaultConfig)
import Text.TEIWA.Source.Common (Header)
defaultHeader :: Header
defaultHeader = ["form", "lemma", "pos"]
otherHeader :: Header
otherHeader = ["Test", "lemma", "pos"]
newFormConfig :: Config
newFormConfig = defaultConfig {
formColumn = Just "Test"
}
otherFormConfig :: Config
otherFormConfig = defaultConfig {
formColumn = Just "lemma"
}
customHeaderConfig :: Config
customHeaderConfig = defaultConfig {
formColumn = Just "Test"
, headerOverride = Just otherHeader
}
{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
module Mock.WebAnno (
anotherGreen
, blue
, context
, dbStep0
, dbStep1
, dbStep2
, dbStep3
, emptyDB
, green
, greenUC
, linesDB
, multiRowStep0
, multiRowStep1
, multiRowStep2
, multiRowStep3
, numberedRows
, red
) where
import Data.Map as Map (Map, empty, fromList, restrictKeys)
import Data.Set as Set (fromList)
import Data.Text.Lazy (Text)
import Text.TEIWA.Annotation.Context (Context(..))
import Text.TEIWA.Source.Common (Field, Numbered, Range(..))
import Text.TEIWA.Source.WebAnno.Data (
AnnotationDB(..), AnnotationValue(..), Row(..), UniqueCoordinate(..)
)
context :: Context
context = Context {
columnIndex = 2
, columnName = "FORM"
, header = ["ID", "SPAN", "LEMMA", "enc_tags"]
, tagger = \_ _ -> "w"
}
green :: AnnotationValue
green = AnnotationValue {labelTag = "GREEN", labelId = Just 0}
anotherGreen :: AnnotationValue
anotherGreen = AnnotationValue {labelTag = "GREEN", labelId = Just 1}
blue :: AnnotationValue
blue = AnnotationValue {labelTag = "BLUE", labelId = Nothing}
red :: AnnotationValue
red = AnnotationValue {labelTag = "RED", labelId = Nothing}
lemma :: Text -> AnnotationValue
lemma labelTag = AnnotationValue {labelTag, labelId = Nothing}
greenUC :: Int -> UniqueCoordinate
greenUC = UniqueCoordinate "enc_tags" "GREEN"
multiRowStep0 :: Map UniqueCoordinate Range
multiRowStep0 = Map.empty
multiRowStep1 :: Map UniqueCoordinate Range
multiRowStep1 = Map.fromList [(greenUC 0, Range 7 7)]
multiRowStep2 :: Map UniqueCoordinate Range
multiRowStep2 = Map.fromList [(greenUC 0, Range 7 8)]
multiRowStep3 :: Map UniqueCoordinate Range
multiRowStep3 = Map.fromList [(greenUC 0, Range 7 8), (greenUC 1, Range 9 9)]
numberedRows :: [Numbered Row]
numberedRows = [
(6, Row {
rowId = Range 1 1
, charSpan = Range 0 4
, form = "This"
, fields = [[lemma "this"], [blue]]
})
, (7, Row {
rowId = Range 1 2
, charSpan = Range 5 7
, form = "is"
, fields = [[lemma "be"], [green]]
})
, (8, Row {
rowId = Range 1 3
, charSpan = Range 10 11
, form = "a"
, fields = [[lemma "a"], [green, blue]]
})
, (9, Row {
rowId = Range 1 4
, charSpan = Range 16 20
, form = "test"
, fields = [[lemma "test"], []]
})
]
linesDB :: Int -> Int -> Map Int [Field]
linesDB a b = Map.restrictKeys numberedFields $ Set.fromList [a .. b]
where
numberedFields = Map.fromList [
(6, [Just "1-1", Just "0-4", Just "This", Just "this", Just "BLUE"])
, (7, [Just "1-2", Just "5-7", Just "is", Just "be", Nothing])
, (8, [Just "1-3", Just "10-11", Just "a", Just "a", Just "BLUE"])
, (9, [Just "1-4", Just "16-20", Just "test", Just "test", Nothing])
]
emptyDB :: AnnotationDB
emptyDB = AnnotationDB Map.empty Map.empty
dbStep0 :: AnnotationDB
dbStep0 = emptyDB {lineFields = linesDB 6 6}
dbStep1 :: AnnotationDB
dbStep1 = AnnotationDB {multiRows = multiRowStep1, lineFields = linesDB 6 7}
dbStep2 :: AnnotationDB
dbStep2 = AnnotationDB {multiRows = multiRowStep2, lineFields = linesDB 6 8}
dbStep3 :: AnnotationDB
dbStep3 = dbStep2 {lineFields = linesDB 6 9}
...@@ -3,7 +3,7 @@ module Regression ( ...@@ -3,7 +3,7 @@ module Regression (
) where ) where
import Distribution.TestSuite (Test) import Distribution.TestSuite (Test)
import Sources (parsing) import Regression.Text.TEIWA.Source as Sources (parsing)
tests :: IO [Test] tests :: IO [Test]
......
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
module Sources ( module Regression.Text.TEIWA.Source (
parsing parsing
) where ) where
import Control.Monad.Except (runExceptT) import Control.Monad.Except (runExceptT)
import Distribution.TestSuite (Progress(..), Test(..), Result(..)) import Distribution.TestSuite (Progress(..), Test(..), Result(..))
import qualified Mock.Annotation as Annotation (coNLLX, ssv, webAnno) import qualified Mock.Annotation as Annotation (coNLLX, ssv, webAnno)
import Text.Printf (printf)
import Text.TEIWA ( import Text.TEIWA (
Format, Origin(..), Source(..), coNLLX, csv, defaultConfig, parse, tsv Format, Origin(..), Source(..), coNLLX, csv, defaultConfig, parse, tsv
, webAnno , webAnno
) )
import Text.TEIWA.Annotation.Data (Annotation) import Text.TEIWA.Annotation.Data (Annotation)
import Utils (simpleTest) import Utils (diff, simpleTest)
data TestCase = TestCase { data TestCase = TestCase {
label :: String label :: String
...@@ -37,11 +36,9 @@ testSource (TestCase {label, caseFormat, file, expected}) = simpleTest label $ d ...@@ -37,11 +36,9 @@ testSource (TestCase {label, caseFormat, file, expected}) = simpleTest label $ d
case actual of case actual of
Left reason -> Fail $ show reason Left reason -> Fail $ show reason
Right annotation -> Right annotation ->
if annotation == expected then Pass else Fail $ diff annotation if annotation == expected
where then Pass
expectedS = show expected else Fail $ diff annotation expected
diff a =
printf "Result differs from expectations: %s vs %s" (show a) expectedS
parsing :: Test parsing :: Test
parsing = Group { parsing = Group {
......
module Unit (
tests
) where
import Distribution.TestSuite (Test)
import Unit.Data.Map.Lazy.Extra (mapTests)
import Unit.Text.TEIWA.Annotation.Context (ofHeaderTest)
import Unit.Text.TEIWA.Source.WebAnno.Annotator (annotationsTree)
tests :: IO [Test]
tests = pure [annotationsTree, mapTests, ofHeaderTest]
module Unit.Data.Map.Lazy.Extra (
mapTests
) where
import Data.Map.Lazy as Map (empty, filterWithKey, restrictKeys)
import Data.Map.Lazy.Extra (maxFor, splitOn)
import Data.Set as Set (fromList)
import Distribution.TestSuite (Test(..))
import Mock.Map (frenchDigits, h2G2)
import Utils (pureTest)
maxForTest :: Test
maxForTest = Group {
groupName = "maxSpan"
, concurrently = True
, groupTests = pureTest (uncurry maxFor) <$> [
("empty map", (byKey, Map.empty), Nothing)
, ("singleton", (byKey, h2G2), Just (42, "answer", 42))
, ("byKey", (byKey, frenchDigits), Just (9, "neuf", 9))
, ("byLength", (byLength, frenchDigits), Just (4, "quatre", 6))
, ("odd byLength",
(byLength, Map.filterWithKey (\k _ -> odd k) frenchDigits),
Just (3, "trois", 5))
]
}
where
byKey k _ = k
byLength _ = length
splitOnTest :: Test
splitOnTest = Group {
groupName = "splitOn"
, concurrently = True
, groupTests = pureTest (uncurry splitOn) <$> [
("empty map", ((0, 0), Map.empty), (Map.empty, Map.empty, Map.empty))
, ("singleton (lower)", ((7, 10), h2G2), (Map.empty, Map.empty, h2G2))
, ("singleton (within)", ((40, 50), h2G2), (Map.empty, h2G2, Map.empty))
, ("singleton (just on it)", ((42, 42), h2G2), (Map.empty, h2G2, Map.empty))
, ("singleton (higher)", ((50, 50), h2G2), (h2G2, Map.empty, Map.empty))
, ("some range", ((3, 5), frenchDigits), (sub 0 2, sub 3 5, sub 6 9))
, ("single point", ((4, 4), frenchDigits), (sub 0 3, sub 4 4, sub 5 9))
, ("outside before", ((-3, 6), frenchDigits), (Map.empty, sub 0 6, sub 7 9))
, ("outside after", ((8, 14), frenchDigits), (sub 0 7, sub 8 9, Map.empty))
, ("reverse bounds", ((8, 5), frenchDigits), (sub 0 7, Map.empty, sub 8 9))
]
}
where
sub a b = Map.restrictKeys frenchDigits $ Set.fromList [a .. b]
mapTests :: Test
mapTests = Group {
groupName = "mapTests"
, concurrently = True
, groupTests = [maxForTest, splitOnTest]
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment