diff --git a/lib/Text/TEIWA/Annotation.hs b/lib/Text/TEIWA/Annotation.hs index 12e3aec9e8f9ff41eb68f449c5159afa58ba4a28..6fd9f82515972c5d076bd4d16887ad73e63d50c2 100644 --- a/lib/Text/TEIWA/Annotation.hs +++ b/lib/Text/TEIWA/Annotation.hs @@ -4,76 +4,78 @@ module Text.TEIWA.Annotation ( Annotation(..) , Attributes - , SentenceAnnotation(..) - , TokenAnnotation(..) + , Node(..) + , Tag(..) , apply ) where import Control.Monad.Except (MonadError(..)) -import Control.Monad.RWS (RWST, evalRWST, get, put, reader, tell) -import Data.Text.Lazy as Text (Text, breakOn, concat, drop, head, length, unpack) +import Control.Monad.RWS (RWST, evalRWST, gets, modify, tell) +import Data.Int (Int64) +import Data.Text.Lazy as Text (Text, breakOn, concat, drop, length, pack, unpack) +import Text.Printf (printf) import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Error (Error(..)) type Attributes = [(Text, Text)] -data TokenAnnotation = TokenAnnotation { - form :: Text +data Tag = Tag { + name :: Text , annotated :: Attributes } deriving Show - -newtype SentenceAnnotation = SentenceAnnotation { - getTokens :: [TokenAnnotation] - } - -data Annotation = - SentenceLevel [SentenceAnnotation] - | TokenLevel [TokenAnnotation] +data Node = Node { + tag :: Tag + , inside :: Annotation + } deriving Show +data Annotation = Token Text | Annotations [Node] deriving Show attribute :: (Text, Text) -> Text attribute (k, v) = Text.concat [" ", k, "=\"", v, "\""] -token :: Text -> TokenAnnotation -> Text -token elementName (TokenAnnotation {form, annotated}) = - Text.concat (openTag ++ (form:closeTag)) - where - openTag = ("<":elementName:(attribute <$> annotated) ++ [">"]) - closeTag = ["</", elementName,">"] +openTag :: Tag -> String +openTag (Tag {name, annotated}) = + printf "<%s%s>" name . Text.concat $ attribute <$> annotated -type Editor m = RWST Config Text Text m () +closeTag :: Tag -> String +closeTag = printf "</%s>" . name -pc :: TokenAnnotation -> Text -pc = token "pc" +data EditorState = EditorState { + input :: Text + , tagStack :: [Tag] + } +type Editor m = RWST Config Text EditorState m () -w :: TokenAnnotation -> Text -w = token "w" +enter :: Monad m => Tag -> Editor m +enter t = modify $ + \editorState -> editorState {tagStack = t:tagStack editorState} -check :: MonadError Error m => Maybe Text -> TokenAnnotation -> (Text, Text) -> Editor m -check openingTag expected@(TokenAnnotation {form}) (before, focused) = - if Text.length focused >= expectedLength - then do - tagged <- reader (handle . punctuation) - put (Text.drop expectedLength focused) - mapM_ tell [before, maybe "" id openingTag, tagged] - else throwError (TermNotFound $ Text.unpack form) - where - expectedLength = Text.length form - handle f = (if expectedLength == 1 && f (Text.head form) then pc else w) expected +flushTags :: Monad m => Editor m +flushTags = do + tags <- gets tagStack + modify $ \editorState -> editorState {tagStack = []} + mapM_ (tell . Text.pack . openTag) tags -tokenLevel :: MonadError Error m => Maybe Text -> [TokenAnnotation] -> Editor m -tokenLevel Nothing [] = pure () -tokenLevel (Just t) [] = tell t -tokenLevel openingTag (annotation@(TokenAnnotation {form}):others) = - ((Text.breakOn form <$> get) >>= check openingTag annotation) - *> (tokenLevel Nothing others) +forget :: Monad m => Int64 -> Editor m +forget count = modify $ + \editorState -> editorState {input = Text.drop count $ input editorState} -sentenceLevel :: MonadError Error m => SentenceAnnotation -> Editor m -sentenceLevel sentenceAnnotation = - tokenLevel (Just "<s>") (getTokens sentenceAnnotation) *> tell "</s>" +annotateNode :: MonadError Error m => Node -> Editor m +annotateNode (Node {tag, inside}) = + enter tag *> annotator inside *> tell (Text.pack $ closeTag tag) + +annotator :: MonadError Error m => Annotation -> Editor m +annotator (Token t) = gets (Text.breakOn t . input) >>= wrap + where + tokenLength = Text.length t + wrap (before, after) = + let totalLength = Text.length before + tokenLength in + if Text.length after >= tokenLength + then tell before *> flushTags *> tell t *> forget totalLength + else throwError (TermNotFound $ Text.unpack t) +annotator (Annotations l) = mapM_ annotateNode l apply :: MonadError Error m => Config -> Annotation -> Text -> m Text -apply config annotation = - fmap snd . evalRWST (terms annotation *> get >>= tell) config +apply config annotation = fmap snd . evalRWST annotate config . start where - terms (SentenceLevel sentences) = mapM_ sentenceLevel sentences - terms (TokenLevel tokens) = tokenLevel Nothing tokens + start input = EditorState {input, tagStack = []} + annotate = annotator annotation *> gets input >>= tell