Skip to content
Snippets Groups Projects
Commit 97c90f5e authored by Alice Brenon's avatar Alice Brenon
Browse files

Generalize Annotations to accomodate arbitrary tags (and not only <s/>, <w/>...

Generalize Annotations to accomodate arbitrary tags (and not only <s/>, <w/> and <pc/>) on arbitrarily nested structure
parent ca22f6fe
No related branches found
No related tags found
No related merge requests found
...@@ -4,76 +4,78 @@ ...@@ -4,76 +4,78 @@
module Text.TEIWA.Annotation ( module Text.TEIWA.Annotation (
Annotation(..) Annotation(..)
, Attributes , Attributes
, SentenceAnnotation(..) , Node(..)
, TokenAnnotation(..) , Tag(..)
, apply , apply
) where ) where
import Control.Monad.Except (MonadError(..)) import Control.Monad.Except (MonadError(..))
import Control.Monad.RWS (RWST, evalRWST, get, put, reader, tell) import Control.Monad.RWS (RWST, evalRWST, gets, modify, tell)
import Data.Text.Lazy as Text (Text, breakOn, concat, drop, head, length, unpack) import Data.Int (Int64)
import Data.Text.Lazy as Text (Text, breakOn, concat, drop, length, pack, unpack)
import Text.Printf (printf)
import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Config (Config(..))
import Text.TEIWA.Error (Error(..)) import Text.TEIWA.Error (Error(..))
type Attributes = [(Text, Text)] type Attributes = [(Text, Text)]
data TokenAnnotation = TokenAnnotation { data Tag = Tag {
form :: Text name :: Text
, annotated :: Attributes , annotated :: Attributes
} deriving Show } deriving Show
data Node = Node {
newtype SentenceAnnotation = SentenceAnnotation { tag :: Tag
getTokens :: [TokenAnnotation] , inside :: Annotation
} } deriving Show
data Annotation = Token Text | Annotations [Node] deriving Show
data Annotation =
SentenceLevel [SentenceAnnotation]
| TokenLevel [TokenAnnotation]
attribute :: (Text, Text) -> Text attribute :: (Text, Text) -> Text
attribute (k, v) = Text.concat [" ", k, "=\"", v, "\""] attribute (k, v) = Text.concat [" ", k, "=\"", v, "\""]
token :: Text -> TokenAnnotation -> Text openTag :: Tag -> String
token elementName (TokenAnnotation {form, annotated}) = openTag (Tag {name, annotated}) =
Text.concat (openTag ++ (form:closeTag)) printf "<%s%s>" name . Text.concat $ attribute <$> annotated
where
openTag = ("<":elementName:(attribute <$> annotated) ++ [">"])
closeTag = ["</", elementName,">"]
type Editor m = RWST Config Text Text m () closeTag :: Tag -> String
closeTag = printf "</%s>" . name
pc :: TokenAnnotation -> Text data EditorState = EditorState {
pc = token "pc" input :: Text
, tagStack :: [Tag]
}
type Editor m = RWST Config Text EditorState m ()
w :: TokenAnnotation -> Text enter :: Monad m => Tag -> Editor m
w = token "w" enter t = modify $
\editorState -> editorState {tagStack = t:tagStack editorState}
check :: MonadError Error m => Maybe Text -> TokenAnnotation -> (Text, Text) -> Editor m flushTags :: Monad m => Editor m
check openingTag expected@(TokenAnnotation {form}) (before, focused) = flushTags = do
if Text.length focused >= expectedLength tags <- gets tagStack
then do modify $ \editorState -> editorState {tagStack = []}
tagged <- reader (handle . punctuation) mapM_ (tell . Text.pack . openTag) tags
put (Text.drop expectedLength focused)
mapM_ tell [before, maybe "" id openingTag, tagged]
else throwError (TermNotFound $ Text.unpack form)
where
expectedLength = Text.length form
handle f = (if expectedLength == 1 && f (Text.head form) then pc else w) expected
tokenLevel :: MonadError Error m => Maybe Text -> [TokenAnnotation] -> Editor m forget :: Monad m => Int64 -> Editor m
tokenLevel Nothing [] = pure () forget count = modify $
tokenLevel (Just t) [] = tell t \editorState -> editorState {input = Text.drop count $ input editorState}
tokenLevel openingTag (annotation@(TokenAnnotation {form}):others) =
((Text.breakOn form <$> get) >>= check openingTag annotation)
*> (tokenLevel Nothing others)
sentenceLevel :: MonadError Error m => SentenceAnnotation -> Editor m annotateNode :: MonadError Error m => Node -> Editor m
sentenceLevel sentenceAnnotation = annotateNode (Node {tag, inside}) =
tokenLevel (Just "<s>") (getTokens sentenceAnnotation) *> tell "</s>" enter tag *> annotator inside *> tell (Text.pack $ closeTag tag)
annotator :: MonadError Error m => Annotation -> Editor m
annotator (Token t) = gets (Text.breakOn t . input) >>= wrap
where
tokenLength = Text.length t
wrap (before, after) =
let totalLength = Text.length before + tokenLength in
if Text.length after >= tokenLength
then tell before *> flushTags *> tell t *> forget totalLength
else throwError (TermNotFound $ Text.unpack t)
annotator (Annotations l) = mapM_ annotateNode l
apply :: MonadError Error m => Config -> Annotation -> Text -> m Text apply :: MonadError Error m => Config -> Annotation -> Text -> m Text
apply config annotation = apply config annotation = fmap snd . evalRWST annotate config . start
fmap snd . evalRWST (terms annotation *> get >>= tell) config
where where
terms (SentenceLevel sentences) = mapM_ sentenceLevel sentences start input = EditorState {input, tagStack = []}
terms (TokenLevel tokens) = tokenLevel Nothing tokens annotate = annotator annotation *> gets input >>= tell
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment