Finish implementing the annotation-building process for WebAnno + add a lot of...

Finish implementing the annotation-building process for WebAnno + add a lot of unit and regression tests

Finish implementing the annotation-building process for WebAnno + add a lot of...
Finish implementing the annotation-building process for WebAnno + add a lot of unit and regression tests
f8a0f4b1 · Alice Brenon · d62768cd · f8a0f4b1 · f8a0f4b1 · f8a0f4b1
Commit f8a0f4b1 authored 3 years ago by Alice Brenon
--- a/lib/Data/Map/Lazy/Extra.hs
+++ b/lib/Data/Map/Lazy/Extra.hs
 module Data.Map.Lazy.Extra (
-    maxFor
+      foldMWithKey
+    , maxFor
+    , splitOn
  ) where
-import Data.Map as Map (Map, foldrWithKey)
+import Control.Monad.State (evalState, get, state)
+import Data.Map as Map (Map, foldlWithKey', foldrWithKey, spanAntitone)
-maxFor :: Ord b => (k -> a -> b) -> Map k a -> Maybe (k, a)
+-- | Evaluate a function on each entry contained in a map to find its maximum.
-maxFor weight = fmap fst . foldrWithKey (\k a -> Just . keepMax k a) Nothing
+-- It returns not only the key and input value but also the maximum reached
+maxFor :: Ord b => (k -> a -> b) -> Map k a -> Maybe (k, a, b)
+maxFor weight = foldrWithKey (\k a -> Just . keepMax k a) Nothing
  where
-    keepMax k a Nothing = ((k, a), weight k a)
+    keepMax k a Nothing = (k, a, weight k a)
-    keepMax k a (Just current@(_, w)) =
+    keepMax k a (Just current@(_, _, w)) =
      let w' = weight k a in
-      if w' > w then ((k, a), w') else current
+      if w' > w then (k, a, w') else current
+foldMWithKey :: Monad m => (b -> k -> a -> m b) -> b -> Map k a -> m b
+foldMWithKey f = foldlWithKey' (\m k v -> m >>= (\a -> f a k v)) . pure
+-- | Given two keys '(from, to)', cuts a map in three:
+-- + the submap containing the keys lower than 'from' and their values
+-- + the submap of keys between 'from' and 'to' (inclusive)
+-- + the submap of keys strictly greater than 'to'
+splitOn :: Ord k => (k, k) -> Map k a -> (Map k a, Map k a, Map k a)
+splitOn (from, to) = evalState ((,,) <$> pop (< from) <*> pop (<= to) <*> get)
+  where
+    pop = state . spanAntitone
--- a/lib/Data/Tuple/Extra.hs
+++ b/lib/Data/Tuple/Extra.hs
+module Data.Tuple.Extra (
+      onEach
+    , zipTriple
+  ) where
+zipTriple :: (a -> b -> c) -> (a, a, a) -> (b, b, b) -> (c, c, c)
+zipTriple f (a0, b0, c0) (a1, b1, c1) = (f a0 a1, f b0 b1, f c0 c1)
+onEach :: Applicative f => (a -> f b) -> (a, a, a) -> f (b, b, b)
+onEach f (a, b, c) = (,,) <$> f a <*> f b <*> f c
--- a/lib/Text/TEIWA/Annotation.hs
+++ b/lib/Text/TEIWA/Annotation.hs
@@ -19,6 +19,8 @@ apply :: MonadError Error m => Config -> Annotation -> Text -> m Text
 apply config annotation =
  fmap snd . evalRWST (annotateWith annotation) config . start
+-- | TEI-C defines a Lightweight Linguistic Annotation framework
+-- https://tei-c.org/release/doc/tei-p5-doc/en/html/AI.html#AILALW
 isTEI :: Text -> Bool
 isTEI = (`elem` ["pos", "lemma", "msd"])

--- a/lib/Text/TEIWA/Annotation/Data.hs
+++ b/lib/Text/TEIWA/Annotation/Data.hs
-{-# LANGUAGE NamedFieldPuns #-}
+{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
 module Text.TEIWA.Annotation.Data (
      Annotation(..)
    , Format
@@ -9,7 +9,7 @@ module Text.TEIWA.Annotation.Data (
  ) where
 import Data.Text.Lazy as Text (Text, concat, pack)
-import Text.TEIWA.Source.Common (Attributes, TEIWAParser, attribute)
+import Text.TEIWA.Source.Common (Attributes, TEIWAParser)
 import Text.Printf (printf)
 data Tag = Tag {
@@ -29,3 +29,6 @@ openTag (Tag {name, annotated}) =
 closeTag :: Tag -> Text
 closeTag = Text.pack . printf "</%s>" . name
+attribute :: (Text, Text) -> Text
+attribute (k, v) = Text.concat [" ", k, "=\"", v, "\""]
--- a/lib/Text/TEIWA/Error.hs
+++ b/lib/Text/TEIWA/Error.hs
@@ -13,6 +13,8 @@ data Error =
  | ParsingError ParseError
  | TermNotFound String
  | OverlappingAnnotation (Line, String) (Line, String)
+  | Debug String
+  deriving (Eq)
 instance Show Error where
  show (NoSuchColumn s) =
@@ -25,3 +27,5 @@ instance Show Error where
  show (TermNotFound t) = printf "Annotated term \"%s\" wasn't found in the input" t
  show (OverlappingAnnotation (l1, s1) (l2, s2)) =
    printf "Annotations \"%s\" (l.%d) and \"%s\" (l.%d) overlap" s1 l1 s2 l2
+  show (Debug s) =
+    printf "DEBUG: %s" s
--- a/lib/Text/TEIWA/Source.hs
+++ b/lib/Text/TEIWA/Source.hs
-{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-}
+{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings, ScopedTypeVariables #-}
 module Text.TEIWA.Source (
      Format
    , Origin(..)

--- a/lib/Text/TEIWA/Source/CoNLLX.hs
+++ b/lib/Text/TEIWA/Source/CoNLLX.hs
-{-# LANGUAGE FlexibleContexts, OverloadedStrings #-}
+{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-}
 module Text.TEIWA.Source.CoNLLX (
    coNLLX
  ) where
 import Control.Applicative ((<|>), many)
-import Data.Text.Lazy as Text (concat, pack)
+import Data.Text.Lazy as Text (Text, concat)
 import Text.Parsec (ParsecT, Stream, char, string)
 import Text.TEIWA.Annotation (tagToken, teiSentence)
 import Text.TEIWA.Annotation.Data (Annotation(..), Format)
 import Text.TEIWA.Annotation.Context as Context (ofHeader)
-import Text.TEIWA.Source.Common (Field, Header, avoid, eol, int, sentences)
+import Text.TEIWA.Source.Common (
+    Field, Header, Range(..), avoid, eol, int, range, sentences, toField
+  )
 header :: Header
 header = [
@@ -25,34 +27,29 @@ header = [
    , "PDEPREL"
  ]
-field :: Stream s m Char => ParsecT s u m Field
+fields :: Stream s m Char => ParsecT s u m [Field]
-field = build <$> avoid "\t\n\r"
+fields = many (char '\t' *> (build <$> avoid "\t\n\r")) <* eol
  where
    build "_" = Nothing
    build other = Just other
-type Range = (Int, Int)
+combineSubs :: [Maybe Text] -> [Maybe Text] -> [Maybe Text]
+combineSubs = zipWith combineFields
-rowID :: Stream s m Char => ParsecT s u m (Either Range Int)
-rowID = int >>= \n -> ((char '-' *> (Left . (,) n <$> int)) <|> pure (Right n))
-row :: Stream s m Char => ParsecT s u m [Field]
-row = rowID >>= either range singleLine
  where
-    toText = Text.pack . show
-    rest = many (char '\t' *> field) <* eol
-    singleLine n = (Just (toText n):) <$> rest
-    range (from, to) =
-      let rangeID = Text.concat [toText from, "-", toText to] in do
-      main <- rest
-      subs <- mapM (\k -> string (show k) *> rest) [from .. to]
-      return $ (Just rangeID):(zipWith (<|>) main $ foldl1 combineSubs subs)
-    combineSubs = zipWith combineFields
    combineFields Nothing Nothing = Nothing
    combineFields Nothing f@(Just _) = f
    combineFields f@(Just _) Nothing = f
    combineFields (Just a) (Just b) = Just $ Text.concat [a, "+", b]
+row :: Stream s m Char => ParsecT s u m [Field]
+row = (range >>= rangeLines) <|> (int >>= singleLine)
+  where
+    singleLine n = (toField n:) <$> fields
+    rangeLines r@(Range {from, to}) = do
+      main <- fields
+      subs <- mapM (\k -> string (show k) *> fields) [from .. to]
+      pure $ (toField r):(zipWith (<|>) main $ foldl1 combineSubs subs)
 coNLLX :: Format
 coNLLX = do
  context <- Context.ofHeader header

--- a/lib/Text/TEIWA/Source/Common.hs
+++ b/lib/Text/TEIWA/Source/Common.hs
-{-# LANGUAGE FlexibleContexts, OverloadedStrings #-}
+{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-}
 module Text.TEIWA.Source.Common (
      Attributes
    , Field
    , Header
    , Numbered
+    , Range(..)
    , TEIWAParser
-    , attribute
    , avoid
    , eol
+    , hull
    , int
+    , range
    , recordLine
    , sentences
+    , size
    , teiTagger
+    , toField
+    , within
  ) where
 import Control.Applicative ((<|>), many, optional)
 import Control.Monad.Reader (ReaderT)
 import Data.Char (isPunctuation)
-import Data.Text.Lazy as Text (Text, concat, head, length, pack)
+import Data.Text.Lazy as Text (Text, head, length, pack)
 import Text.Parsec (
      Line, ParsecT, Stream, char, digit, endOfLine, getParserState, many1
    , noneOf, sepEndBy, sourceLine, statePos, try
@@ -27,6 +32,24 @@ import Text.TEIWA.Error (Error(..))
 type Field = Maybe Text
 type Header = [Text]
+data Range = Range {
+      from :: Int
+    , to :: Int
+  } deriving (Eq)
+instance Show Range where
+  show (Range {from, to}) = show from ++ "-" ++ show to
+within :: Int -> Range -> Bool
+within i (Range {from, to}) = from <= i && i <= to
+hull :: Range -> Range -> Range
+hull (Range newMin newMax) (Range oldMin oldMax) =
+  Range (min newMin oldMin) (max newMax oldMax)
+size :: Range -> Int
+size (Range {from, to}) = to - from
 type Numbered a = (Line, a)
 type Attributes = [(Text, Text)]
@@ -47,9 +70,6 @@ currentLine = sourceLine . statePos <$> getParserState
 recordLine :: Monad m => ParsecT s u m a -> ParsecT s u m (Numbered a)
 recordLine p = (,) <$> currentLine <*> p
-attribute :: (Text, Text) -> Text
-attribute (k, v) = Text.concat [" ", k, "=\"", v, "\""]
 sentence :: Stream s m Char => ParsecT s u m p -> ParsecT s u m [Numbered p]
 sentence row = many comment *> many1 (recordLine row)
  where
@@ -57,9 +77,15 @@ sentence row = many comment *> many1 (recordLine row)
 sentences :: Stream s m Char => ParsecT s u m p -> ParsecT s u m [[Numbered p]]
 sentences row = sentence row `sepEndBy` many1 eol
--sentences row = many (many1 eol *> sentence row)
+range :: Stream s m Char => ParsecT s u m Range
+range = try (Range <$> int <* char '-' <*> int)
+toField :: Show a => a -> Field
+toField = Just . Text.pack . show
 teiTagger :: Text -> Attributes -> Text
 teiTagger t _
  | Text.length t == 1 && isPunctuation (Text.head t) = "pc"
+  | Text.length t == 0 = "rs"
  | otherwise = "w"
--- a/lib/Text/TEIWA/Source/WebAnno.hs
+++ b/lib/Text/TEIWA/Source/WebAnno.hs
 {-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-}
-module Text.TEIWA.Source.WebAnno {-(
+module Text.TEIWA.Source.WebAnno (
    webAnno
-  )-} where
+  ) where
 import Control.Applicative ((<|>), many, optional)
-import Control.Monad.Except (MonadError)
-import Data.Either (partitionEithers)
-import Data.Map as Map (Map, empty, insertWith)
-import Data.Map.Lazy.Extra as Map (maxFor)
-import Data.List (foldl')
 import Data.Text.Lazy as Text (Text, pack)
 import Text.Parsec (
-      Line, ParsecT, Stream, anyChar, char, count, many1, noneOf, sepBy1, skipMany1
+    ParsecT, Stream, anyChar, char, many1, noneOf, sepBy1, string
-    , string
-  )
-import Text.TEIWA.Annotation (tagToken, teiSentence)
-import Text.TEIWA.Annotation.Context (Context)
-import qualified Text.TEIWA.Annotation.Context as Context (Context(..), ofHeader)
-import Text.TEIWA.Annotation.Data (Annotation(..), Format, Node(..))
-import Text.TEIWA.Error (Error)
-import Text.TEIWA.Source.Common (
-    Field, Header, Numbered, TEIWAParser, avoid, eol, int, recordLine, sentences
  )
+import Text.TEIWA.Annotation.Context as Context (ofHeader)
+import Text.TEIWA.Annotation.Data (Format)
+import Text.TEIWA.Source.Common (Header, avoid, eol, int, range, sentences)
+import Text.TEIWA.Source.WebAnno.Annotator (annotate)
+import Text.TEIWA.Source.WebAnno.Data (AnnotationValue(..), Row(..))
 pragma :: Stream s m Char => ParsecT s u m Text
 pragma = char '#' *> avoid "=" <* char '='
@@ -29,8 +20,7 @@ pragma = char '#' *> avoid "=" <* char '='
 header :: Stream s m Char => ParsecT s u m Header
 header = do
  pragma *> string "WebAnno TSV " *> int `sepBy1` char '.' *> eol
-  --concat <$> (pragma >>= columns) `sepBy1` eol
+  concat . (["ID", "SPAN", "FORM"]:) <$> many ((pragma >>= columns) <* eol)
-  ("FORM":) . concat <$> many ((pragma >>= columns) <* eol)
  where
    columns "T_SP" = drop 1 <$> avoid "|\r\n" `sepBy1` char '|'
    columns _ = avoid "\r\n" *> pure []
@@ -38,57 +28,23 @@ header = do
 reserved :: [Char]
 reserved = "\\[]|_-;\t\n*"
-type TokenAnnotation = (Text, Int)
+field :: Stream s m Char => ParsecT s u m [AnnotationValue]
-field :: Stream s m Char => ParsecT s u m [TokenAnnotation]
 field = noValue <|> component `sepBy1` (char '|')
  where
    noValue = char '_' *> pure []
-    uniqueID = char '[' *> int <* char ']'
+    bracketedID = char '[' *> int <* char ']'
-    component = (,)
+    component = AnnotationValue
      <$> (Text.pack <$> many1 (noneOf reserved <|> (char '\\' *> anyChar)))
-      <*> (maybe 0 id <$> optional uniqueID)
+      <*> (optional bracketedID)
-data Row = Row {
-      form :: Text
-    , fields :: [[TokenAnnotation]]
-  }
 row :: Stream s m Char => ParsecT s u m Row
-row = makeRow <$> count 3 column <*> many1 (field <* char '\t') <* eol
+row = Row
-  where
+  <$> (range <* char '\t')
-    column = avoid "\t\n" <* char '\t'
+  <*> (range <* char '\t')
-    makeRow columns fields = Row {form = columns !! 2, fields}
+  <*> (avoid "\t\n" <* char '\t')
+  <*> many1 (field <* char '\t') <* eol
-getSpans :: Context -> [Numbered Row] -> Map (Text, TokenAnnotation) (Line, Line)
-getSpans context numbered =
-  foldl' (\tmpMap (lineNumber, row) ->
-      foldl' (\tmpMap' (hK, labels) ->
-          foldr (\label ->
-              Map.insertWith extendSpan (hK, label) (lineNumber, lineNumber)
-            ) tmpMap' labels
-        ) tmpMap row
-    ) Map.empty byHeader
-  where
-    byHeader = fmap (zip (Context.header context) . fields) <$> numbered
-    extendSpan (_, newMax) (oldMin, _) = (oldMin, newMax)
-extractNodes :: MonadError Error m => Context -> [Numbered Row] -> m [Node]
-extractNodes _ [] = pure []
-extractNodes context numbered = undefined
-  where
-    spans = getSpans context numbered
-    maximalSpan = maxFor (\_ (minL, maxL) -> maxL - minL)
-{-
-extractNodes context ((lineNumber, Row {fields, spans = []}):others) =
-  tagToken (lineNumber, fields) : extractNodes context others
-extractNode context ((lineNumber, Row {fields, spans}):others) = undefined
-}
 webAnno :: Format
 webAnno = do
  context <- Context.ofHeader =<< header
-  Annotations . fmap teiSentence <$> (
+  (many eol *> sentences row) >>= annotate context
-      (many eol *> sentences row) >>= mapM (extractNodes context)
-    )
--- a/lib/Text/TEIWA/Source/WebAnno/Annotator.hs
+++ b/lib/Text/TEIWA/Source/WebAnno/Annotator.hs
+{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-}
+module Text.TEIWA.Source.WebAnno.Annotator (
+      AnnotationDB(..)
+    , UniqueCoordinate(..)
+    , annotate
+    , buildTree
+    , index
+    , indexCell
+    , splitDB
+  ) where
+import Control.Monad.Except (MonadError(..))
+import Control.Monad.RWS (MonadReader)
+import Data.Map as Map (Map, delete, empty, insert, insertWith, toList)
+import Data.Map.Lazy.Extra as Map (foldMWithKey, maxFor, splitOn)
+import Data.List as List (foldl')
+import Data.Text.Lazy as Text (Text, intercalate)
+import Data.Tuple.Extra (onEach, zipTriple)
+import Text.Parsec (Line)
+import Text.TEIWA.Annotation (tagToken, teiSentence)
+import Text.TEIWA.Annotation.Context (Context)
+import qualified Text.TEIWA.Annotation.Context as Context (Context(..))
+import Text.TEIWA.Annotation.Data (Annotation(..), Node(..), Tag(..))
+import Text.TEIWA.Config (Config)
+import Text.TEIWA.Error (Error(..))
+import Text.TEIWA.Source.Common (
+    Field, Numbered, Range(..), hull , size, toField
+  )
+import Text.TEIWA.Source.WebAnno.Data (
+    AnnotationDB(..), AnnotationValue(..), Row(..), UniqueCoordinate(..)
+  )
+-- | Each cell in a WebAnno file may contain several 'AnnotationValue's which
+-- are indexed separately:
+-- + the multiline ones, with an ID, get pushed to the 'Range' map
+-- + the others are simply joined by a "+" and become a 'Field' in the row
+indexCell :: Line -> (Map UniqueCoordinate Range, [Field]) -> (Text, [AnnotationValue])
+  -> (Map UniqueCoordinate Range, [Field])
+indexCell line (db, lineFields) (tagName, annotationValues) =
+  let (newDB, tags) = foldr switch (db, []) annotationValues in
+  (newDB, (fieldOfTags tags):lineFields)
+  where
+    switch (AnnotationValue newTag Nothing) (tmpDB, tmpTags) =
+      (tmpDB, newTag:tmpTags)
+    switch (AnnotationValue value (Just uniqueID)) (tmpDB, tmpTags) =
+      let uniqueCoords = UniqueCoordinate {tagName, value, uniqueID} in
+      (Map.insertWith hull uniqueCoords (Range line line) tmpDB, tmpTags)
+    fieldOfTags [] = Nothing
+    fieldOfTags tags = Just $ Text.intercalate "+" tags
+-- | Given a 'Context' c to provide the header fields, we can now index a
+-- '[[Numbered Rows]]' into an 'AnnotationDB' by folding twice on the rows, and
+-- for each one build the corresponding line by folding the previous function on
+-- each cell
+index :: Context -> [[Numbered Row]] -> AnnotationDB
+index context = foldl' (foldl' indexLine) (AnnotationDB Map.empty Map.empty)
+  where
+    indexLine db (line, Row {rowId, charSpan, form, fields}) =
+      let cells = zip (drop 2 $ Context.header context) fields
+        -- FORM column will already have been removed by Context.ofHeader so we
+        -- need to drop only 2 fields (ID and SPAN, which are built-in for this
+        -- format)
+          tmpPair = (multiRows db, [Just form, toField charSpan, toField rowId])
+          (newMRs, newLine) = foldl' (indexCell line) tmpPair cells in
+      AnnotationDB newMRs (Map.insert line (reverse newLine) $ lineFields db)
+-- | As the annotation tree will be built recursively, we will need to split the
+-- DB at each step into the DBs of lines strictly before the maximal multiline
+-- annotation found, the lines contained within it and the lines strictly after
+-- it
+splitDB :: MonadError Error m => (UniqueCoordinate, Range) -> AnnotationDB ->
+  m (AnnotationDB, AnnotationDB, AnnotationDB)
+splitDB (maxCoords, Range {from, to}) (AnnotationDB {multiRows, lineFields}) =
+  zipTriple (flip AnnotationDB)
+    (splitOn (from, to) lineFields)
+    <$> (foldMWithKey classify emptyTriple $ Map.delete maxCoords multiRows)
+  where
+    classify (lower, within, greater) k r@(Range otherFrom otherTo)
+      | otherTo < from = pure (Map.insert k r lower, within, greater)
+      | to < otherFrom = pure (lower, within, Map.insert k r greater)
+      | from <= otherFrom && otherTo <= to =
+         pure (lower, Map.insert k r within, greater)
+      | otherwise = throwError $
+        OverlappingAnnotation (from, show maxCoords) (otherFrom, show k)
+    emptyTriple = (Map.empty, Map.empty, Map.empty)
+-- | Once the annotations database is built, an 'Annotation' tree may be
+-- retrieved by taking the multiline annotations by decreasing inclusion order,
+-- and outputting the simple rows for each interval the delimitate
+buildTree :: (MonadError Error m, MonadReader Config m) =>
+  Context -> AnnotationDB -> m [Node]
+buildTree context db@(AnnotationDB {multiRows, lineFields}) =
+  case maxFor (\_ -> size) multiRows of
+    Just (maxCoords@(UniqueCoordinate {tagName, value}), bounds, _) -> do
+      shards <- splitDB (maxCoords, bounds) db
+      (lower, within, greater) <- onEach (buildTree context) shards
+      let annotated = [(tagName, value)]
+      let name = (Context.tagger context) "" annotated
+      let n = Node (Tag {name, annotated}) (Annotations within)
+      pure (lower ++ (n:greater))
+    _ -> mapM (tagToken context) $ Map.toList lineFields
+annotate :: (MonadError Error m, MonadReader Config m) =>
+  Context -> [[Numbered Row]] -> m Annotation
+annotate context = fmap Annotations . buildTree context . index context
--- a/lib/Text/TEIWA/Source/WebAnno/Data.hs
+++ b/lib/Text/TEIWA/Source/WebAnno/Data.hs
+{-# LANGUAGE NamedFieldPuns #-}
+module Text.TEIWA.Source.WebAnno.Data (
+      AnnotationDB(..)
+    , AnnotationValue(..)
+    , Row(..)
+    , UniqueCoordinate(..)
+  ) where
+import Data.Map as Map (Map)
+import Data.Text.Lazy (Text, unpack)
+import Text.Parsec (Line)
+import Text.TEIWA.Source.Common (Field, Range)
+data AnnotationValue = AnnotationValue {
+      labelTag :: Text
+    , labelId :: Maybe Int
+  } deriving (Eq, Ord)
+instance Show AnnotationValue where
+  show (AnnotationValue {labelTag, labelId}) =
+    unpack labelTag ++ maybe "" show labelId
+data Row = Row {
+      rowId :: Range
+    , charSpan :: Range
+    , form :: Text
+    , fields :: [[AnnotationValue]]
+  } deriving (Eq, Show)
+-- | Since annotation tags can be found on several lines, they are distinguished
+-- | with their numeric ID written within brackets.
+-- | It entails that a pair containing a tag name and a value isn't unique, so
+-- | we need to define a type for unique coordinates:
+data UniqueCoordinate = UniqueCoordinate {
+      tagName :: Text
+    , value :: Text
+    , uniqueID :: Int
+  } deriving (Eq, Ord, Show)
+-- | We can now define a type representing the indexed data for an annotation,
+-- separating the annotations spanning on several rows from the single-row
+-- ones which are much similar to the types found in the other supported
+-- formats.
+data AnnotationDB = AnnotationDB {
+      multiRows :: Map UniqueCoordinate Range
+    , lineFields :: Map Line [Field]
+  } deriving (Eq, Show)
--- a/teiwa.cabal
+++ b/teiwa.cabal
@@ -23,19 +23,22 @@ build-type:          Simple
 extra-source-files:  CHANGELOG.md
 library
-  exposed-modules:     Text.TEIWA
+  exposed-modules:     Data.Map.Lazy.Extra
-                     , Text.TEIWA.Annotation.Data
+                     , Data.Tuple.Extra
-  other-modules:       Data.Map.Lazy.Extra
+                     , Text.TEIWA
-                     , Text.TEIWA.Annotation
                     , Text.TEIWA.Annotation.Context
+                     , Text.TEIWA.Annotation.Data
+                     , Text.TEIWA.Source.Common
+                     , Text.TEIWA.Source.WebAnno
+                     , Text.TEIWA.Source.WebAnno.Annotator
+                     , Text.TEIWA.Source.WebAnno.Data
+  other-modules:       Text.TEIWA.Annotation
                     , Text.TEIWA.Annotation.Editor
                     , Text.TEIWA.Config
                     , Text.TEIWA.Error
                     , Text.TEIWA.Source
-                     , Text.TEIWA.Source.Common
                     , Text.TEIWA.Source.CoNLLX
                     , Text.TEIWA.Source.SSV
-                     , Text.TEIWA.Source.WebAnno
  build-depends:       base >=4.12 && <4.15
                     , bytestring
                     , containers
@@ -66,10 +69,30 @@ test-suite regression
  type:                detailed-0.9
  test-module:         Regression
  other-modules:       Mock.Annotation
-                     , Sources
+                     , Regression.Text.TEIWA.Source
+                     , Utils
+  build-depends:       base
+                     , Cabal
+                     , mtl
+                     , teiwa
+                     , text
+  hs-source-dirs:      test
+  ghc-options:         -Wall
+  default-language:    Haskell2010
+test-suite unit
+  type:                detailed-0.9
+  test-module:         Unit
+  other-modules:       Mock.Map
+                     , Mock.Text.TEIWA
+                     , Mock.WebAnno
+                     , Unit.Data.Map.Lazy.Extra
+                     , Unit.Text.TEIWA.Source.WebAnno.Annotator
+                     , Unit.Text.TEIWA.Annotation.Context
                     , Utils
  build-depends:       base
                     , Cabal
+                     , containers
                     , mtl
                     , teiwa
                     , text

--- a/test/Mock/Annotation.hs
+++ b/test/Mock/Annotation.hs
 {-# LANGUAGE OverloadedStrings #-}
 module Mock.Annotation (
      coNLLX
+    , node
    , ssv
    , webAnno
  ) where
@@ -95,6 +96,41 @@ ssv = Annotations $ zipWith (node ["LEMMA", "POS"]) (concat sentences) [
    , [".", "PUNCT"]
  ]
+webAnno :: Annotation
+webAnno = Annotations $
+    (zipWith mkNode (sentences !! 0) [
+            ["1-1", "0-3", "you", "PRON"]
+          , ["1-2", "4-7", "can", "AUX"]
+          , ["1-3", "8-12", "talk", "VERB"]
+          , ["1-4", "13-15", "to", "ADP"]
+          , ["1-5", "16-18", "I", "PRON"]
+          , ["1-6", "18-19", ".", "PUNCT"]
+        ])
+  ++ [
+      syntax "CONDITION" [
+            mkNode (sentences !! 1 !! 0) ["2-1", "0-2", "if", "SCONJ"]
+          , syntax "HYPOTHESIS" $
+              zipWith mkNode (take 3 $ drop 1 (sentences !! 1)) [
+                    ["2-2", "3-6", "you", "PRON"]
+                  , ["2-3", "6-9", "be", "AUX"]
+                  , ["2-4", "10-16", "lonely", "ADJ"]
+                ]
+          , syntax "CONCLUSION" $
+              zipWith mkNode (take 5 $ drop 4 (sentences !! 1)) [
+                    ["2-5", "17-20", "you", "PRON"]
+                  , ["2-6", "21-24", "can", "AUX"]
+                  , ["2-7", "25-29", "talk", "VERB"]
+                  , ["2-8", "30-32", "to", "ADP"]
+                  , ["2-9", "32-34", "I", "PRON"]
+                ]
+        ]
+    , mkNode (sentences !! 1 !! 9) ["2-10", "34-35", ".", "PUNCT"]
+  ]
+  where
+    mkNode = node ["ID", "SPAN", "LEMMA", "POS"]
+    syntax s = Node (Tag "rs" [("SYNTAX", s)]) . Annotations
+{-
 webAnno :: Annotation
 webAnno = Annotations $
  sentence <$>
@@ -121,3 +157,4 @@ webAnno = Annotations $
      )
  where
    mkNode = node ["SPAN", "LEMMA", "POS"]
+-}
--- a/test/Mock/Map.hs
+++ b/test/Mock/Map.hs
+module Mock.Map (
+      frenchDigits
+    , h2G2
+  ) where
+import Data.Map.Lazy as Map (Map, fromList)
+h2G2 :: Map Int String
+h2G2 = Map.fromList [(42, "answer")]
+frenchDigits :: Map Int String
+frenchDigits = Map.fromList [
+      (0, "zero")
+    , (1, "un")
+    , (2, "deux")
+    , (3, "trois")
+    , (4, "quatre")
+    , (5, "cinq")
+    , (6, "six")
+    , (7, "sept")
+    , (8, "huit")
+    , (9, "neuf")
+  ]
--- a/test/Mock/Text/TEIWA.hs
+++ b/test/Mock/Text/TEIWA.hs
+{-# LANGUAGE OverloadedStrings #-}
+module Mock.Text.TEIWA (
+      customHeaderConfig
+    , defaultHeader
+    , newFormConfig
+    , otherFormConfig
+    , otherHeader
+  ) where
+import Text.TEIWA (Config(..), defaultConfig)
+import Text.TEIWA.Source.Common (Header)
+defaultHeader :: Header
+defaultHeader = ["form", "lemma", "pos"]
+otherHeader :: Header
+otherHeader = ["Test", "lemma", "pos"]
+newFormConfig :: Config
+newFormConfig = defaultConfig {
+    formColumn = Just "Test"
+  }
+otherFormConfig :: Config
+otherFormConfig = defaultConfig {
+    formColumn = Just "lemma"
+  }
+customHeaderConfig :: Config
+customHeaderConfig = defaultConfig {
+      formColumn = Just "Test"
+    , headerOverride = Just otherHeader
+  }
--- a/test/Mock/WebAnno.hs
+++ b/test/Mock/WebAnno.hs
+{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
+module Mock.WebAnno (
+      anotherGreen
+    , blue
+    , context
+    , dbStep0
+    , dbStep1
+    , dbStep2
+    , dbStep3
+    , emptyDB
+    , green
+    , greenUC
+    , linesDB
+    , multiRowStep0
+    , multiRowStep1
+    , multiRowStep2
+    , multiRowStep3
+    , numberedRows
+    , red
+  ) where
+import Data.Map as Map (Map, empty, fromList, restrictKeys)
+import Data.Set as Set (fromList)
+import Data.Text.Lazy (Text)
+import Text.TEIWA.Annotation.Context (Context(..))
+import Text.TEIWA.Source.Common (Field, Numbered, Range(..))
+import Text.TEIWA.Source.WebAnno.Data (
+    AnnotationDB(..), AnnotationValue(..), Row(..), UniqueCoordinate(..)
+  )
+context :: Context
+context = Context {
+      columnIndex = 2
+    , columnName = "FORM"
+    , header = ["ID", "SPAN", "LEMMA", "enc_tags"]
+    , tagger = \_ _ -> "w"
+  }
+green :: AnnotationValue
+green = AnnotationValue {labelTag = "GREEN", labelId = Just 0}
+anotherGreen :: AnnotationValue
+anotherGreen = AnnotationValue {labelTag = "GREEN", labelId = Just 1}
+blue :: AnnotationValue
+blue = AnnotationValue {labelTag = "BLUE", labelId = Nothing}
+red :: AnnotationValue
+red = AnnotationValue {labelTag = "RED", labelId = Nothing}
+lemma :: Text -> AnnotationValue
+lemma labelTag = AnnotationValue {labelTag, labelId = Nothing}
+greenUC :: Int -> UniqueCoordinate
+greenUC = UniqueCoordinate "enc_tags" "GREEN"
+multiRowStep0 :: Map UniqueCoordinate Range
+multiRowStep0 = Map.empty
+multiRowStep1 :: Map UniqueCoordinate Range
+multiRowStep1 = Map.fromList [(greenUC 0, Range 7 7)]
+multiRowStep2 :: Map UniqueCoordinate Range
+multiRowStep2 = Map.fromList [(greenUC 0, Range 7 8)]
+multiRowStep3 :: Map UniqueCoordinate Range
+multiRowStep3 = Map.fromList [(greenUC 0, Range 7 8), (greenUC 1, Range 9 9)]
+numberedRows :: [Numbered Row]
+numberedRows = [
+      (6, Row {
+            rowId = Range 1 1
+          , charSpan = Range 0 4
+          , form = "This"
+          , fields = [[lemma "this"], [blue]]
+        })
+    , (7, Row {
+            rowId = Range 1 2
+          , charSpan = Range 5 7
+          , form = "is"
+          , fields = [[lemma "be"], [green]]
+        })
+    , (8, Row {
+            rowId = Range 1 3
+          , charSpan = Range 10 11
+          , form = "a"
+          , fields = [[lemma "a"], [green, blue]]
+        })
+    , (9, Row {
+            rowId = Range 1 4
+          , charSpan = Range 16 20
+          , form = "test"
+          , fields = [[lemma "test"], []]
+        })
+  ]
+linesDB :: Int -> Int -> Map Int [Field]
+linesDB a b = Map.restrictKeys numberedFields $ Set.fromList [a .. b]
+  where
+    numberedFields = Map.fromList [
+          (6, [Just "1-1", Just "0-4", Just "This", Just "this", Just "BLUE"])
+        , (7, [Just "1-2", Just "5-7", Just "is", Just "be", Nothing])
+        , (8, [Just "1-3", Just "10-11", Just "a", Just "a", Just "BLUE"])
+        , (9, [Just "1-4", Just "16-20", Just "test", Just "test", Nothing])
+      ]
+emptyDB :: AnnotationDB
+emptyDB = AnnotationDB Map.empty Map.empty
+dbStep0 :: AnnotationDB
+dbStep0 = emptyDB {lineFields = linesDB 6 6}
+dbStep1 :: AnnotationDB
+dbStep1 = AnnotationDB {multiRows = multiRowStep1, lineFields = linesDB 6 7}
+dbStep2 :: AnnotationDB
+dbStep2 = AnnotationDB {multiRows = multiRowStep2, lineFields = linesDB 6 8}
+dbStep3 :: AnnotationDB
+dbStep3 = dbStep2 {lineFields = linesDB 6 9}
--- a/test/Regression.hs
+++ b/test/Regression.hs
@@ -3,7 +3,7 @@ module Regression (
  ) where
 import Distribution.TestSuite (Test)
-import Sources (parsing)
+import Regression.Text.TEIWA.Source as Sources (parsing)
 tests :: IO [Test]

--- a/test/Sources.hs
+++ b/test/Sources.hs
 {-# LANGUAGE NamedFieldPuns #-}
-module Sources (
+module Regression.Text.TEIWA.Source (
    parsing
  ) where
 import Control.Monad.Except (runExceptT)
 import Distribution.TestSuite (Progress(..), Test(..), Result(..))
 import qualified Mock.Annotation as Annotation (coNLLX, ssv, webAnno)
-import Text.Printf (printf)
 import Text.TEIWA (
      Format, Origin(..), Source(..), coNLLX, csv, defaultConfig, parse, tsv
    , webAnno
  )
 import Text.TEIWA.Annotation.Data (Annotation)
-import Utils (simpleTest)
+import Utils (diff, simpleTest)
 data TestCase = TestCase {
      label :: String
@@ -37,11 +36,9 @@ testSource (TestCase {label, caseFormat, file, expected}) = simpleTest label $ d
    case actual of
      Left reason -> Fail $ show reason
      Right annotation ->
-        if annotation == expected then Pass else Fail $ diff annotation
+        if annotation == expected
-  where
+        then Pass
-    expectedS = show expected
+        else Fail $ diff annotation expected
-    diff a =
-      printf "Result differs from expectations: %s vs %s" (show a) expectedS
 parsing :: Test
 parsing = Group {

--- a/test/Unit.hs
+++ b/test/Unit.hs
+module Unit (
+    tests
+  ) where
+import Distribution.TestSuite (Test)
+import Unit.Data.Map.Lazy.Extra (mapTests)
+import Unit.Text.TEIWA.Annotation.Context (ofHeaderTest)
+import Unit.Text.TEIWA.Source.WebAnno.Annotator (annotationsTree)
+tests :: IO [Test]
+tests = pure [annotationsTree, mapTests, ofHeaderTest]
--- a/test/Unit/Data/Map/Lazy/Extra.hs
+++ b/test/Unit/Data/Map/Lazy/Extra.hs
+module Unit.Data.Map.Lazy.Extra (
+    mapTests
+  ) where
+import Data.Map.Lazy as Map (empty, filterWithKey, restrictKeys)
+import Data.Map.Lazy.Extra (maxFor, splitOn)
+import Data.Set as Set (fromList)
+import Distribution.TestSuite (Test(..))
+import Mock.Map (frenchDigits, h2G2)
+import Utils (pureTest)
+maxForTest :: Test
+maxForTest = Group {
+      groupName = "maxSpan"
+    , concurrently = True
+    , groupTests = pureTest (uncurry maxFor) <$> [
+            ("empty map", (byKey, Map.empty), Nothing)
+          , ("singleton", (byKey, h2G2), Just (42, "answer", 42))
+          , ("byKey", (byKey, frenchDigits), Just (9, "neuf", 9))
+          , ("byLength", (byLength, frenchDigits), Just (4, "quatre", 6))
+          , ("odd byLength",
+              (byLength, Map.filterWithKey (\k _ -> odd k) frenchDigits),
+              Just (3, "trois", 5))
+        ]
+  }
+  where
+    byKey k _ = k
+    byLength _ = length
+splitOnTest :: Test
+splitOnTest = Group {
+      groupName = "splitOn"
+    , concurrently = True
+    , groupTests = pureTest (uncurry splitOn) <$> [
+            ("empty map", ((0, 0), Map.empty), (Map.empty, Map.empty, Map.empty))
+          , ("singleton (lower)", ((7, 10), h2G2), (Map.empty, Map.empty, h2G2))
+          , ("singleton (within)", ((40, 50), h2G2), (Map.empty, h2G2, Map.empty))
+          , ("singleton (just on it)", ((42, 42), h2G2), (Map.empty, h2G2, Map.empty))
+          , ("singleton (higher)", ((50, 50), h2G2), (h2G2, Map.empty, Map.empty))
+          , ("some range", ((3, 5), frenchDigits), (sub 0 2, sub 3 5, sub 6 9))
+          , ("single point", ((4, 4), frenchDigits), (sub 0 3, sub 4 4, sub 5 9))
+          , ("outside before", ((-3, 6), frenchDigits), (Map.empty, sub 0 6, sub 7 9))
+          , ("outside after", ((8, 14), frenchDigits), (sub 0 7, sub 8 9, Map.empty))
+          , ("reverse bounds", ((8, 5), frenchDigits), (sub 0 7, Map.empty, sub 8 9))
+        ]
+  }
+  where
+    sub a b = Map.restrictKeys frenchDigits $ Set.fromList [a .. b]
+mapTests :: Test
+mapTests = Group {
+      groupName = "mapTests"
+    , concurrently = True
+    , groupTests = [maxForTest, splitOnTest]
+  }