diff --git a/app/Main.hs b/app/Main.hs index 6e7839869153b4ab3a2f254b0d35215b06b5388e..e9f574661ed5b4058fe5c9d91b380f634858ff05 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -2,10 +2,13 @@ module Main where import CLI (Command(..), getCommand) +import Control.Applicative ((<|>)) import Control.Monad.Except (ExceptT(..), runExceptT) import Data.Text.Lazy as Text (Text) import Data.Text.Lazy.IO as Text (getContents, putStr) -import Text.TEIWA (Error, Origin(..), Source(..), annotateWith, coNLLX, csv, tsv) +import Text.TEIWA ( + Error, Origin(..), Source(..), annotateWith, coNLLX, csv, tsv, webAnno + ) import System.FilePath (takeExtension) import System.Exit (die) @@ -16,7 +19,7 @@ annotator (Command {annotationsFile, config}) = extension = takeExtension annotationsFile format | extension == ".csv" = csv - | extension == ".tsv" = tsv + | extension == ".tsv" = webAnno <|> tsv | otherwise = coNLLX main :: IO () diff --git a/lib/Text/TEIWA.hs b/lib/Text/TEIWA.hs index d4ad823e5cdc672ec20a80ef841238c0c58e5411..0a6b212020cb6f3c899019322f7f383801c4a555 100644 --- a/lib/Text/TEIWA.hs +++ b/lib/Text/TEIWA.hs @@ -6,9 +6,7 @@ module Text.TEIWA ( , module Source , annotate , annotateWith - , fromCSV - , fromCoNLLX - , fromWebAnno + , from ) where import Control.Monad.Except (MonadError(..)) @@ -30,11 +28,5 @@ annotateWith config source input = do annotate :: (MonadError Error m, MonadIO m) => Source -> Filter m annotate = annotateWith defaultConfig -fromCSV :: (MonadError Error m, MonadIO m) => Origin -> Filter m -fromCSV = annotate . Source csv - -fromCoNLLX :: (MonadError Error m, MonadIO m) => Origin -> Filter m -fromCoNLLX = annotate . Source coNLLX - -fromWebAnno :: (MonadError Error m, MonadIO m) => Origin -> Filter m -fromWebAnno = annotate . Source webAnno +from :: (MonadError Error m, MonadIO m) => Format -> Origin -> Filter m +from f = annotate . Source f diff --git a/lib/Text/TEIWA/Annotation.hs b/lib/Text/TEIWA/Annotation.hs index 8c1d502433afaf5f2878724dfbf8a9f2deeee61c..8fba39e04b15253e6c8c5ec860be9d80f6974efb 100644 --- a/lib/Text/TEIWA/Annotation.hs +++ b/lib/Text/TEIWA/Annotation.hs @@ -1,79 +1,46 @@ -{-# LANGUAGE NamedFieldPuns #-} -{-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-} module Text.TEIWA.Annotation ( - Annotation(..) - , Attributes - , Node(..) - , Tag(..) - , apply + apply + , s_ + , tagToken ) where import Control.Monad.Except (MonadError(..)) -import Control.Monad.RWS (RWST, evalRWST, gets, modify, tell) -import Data.Int (Int64) -import Data.Text.Lazy as Text (Text, breakOn, concat, drop, length, pack, unpack) -import Text.Printf (printf) +import Control.Monad.RWS (MonadReader, evalRWST, reader) +import Data.Text.Lazy as Text (Text, toLower, unpack) +import Text.TEIWA.Annotation.Context (Context(..)) +import Text.TEIWA.Annotation.Data (Annotation(..), Node(..), Tag(..)) +import Text.TEIWA.Annotation.Editor (annotateWith, start) import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Error (Error(..)) +import Text.TEIWA.Source.Common (Attributes, Field, Row) -type Attributes = [(Text, Text)] - -data Tag = Tag { - name :: Text - , annotated :: Attributes - } deriving Show -data Node = Node { - tag :: Tag - , inside :: Annotation - } deriving Show -data Annotation = Token Text | Annotations [Node] deriving Show - -attribute :: (Text, Text) -> Text -attribute (k, v) = Text.concat [" ", k, "=\"", v, "\""] - -openTag :: Tag -> Editor m -openTag (Tag {name, annotated}) = tell . Text.pack $ - printf "<%s%s>" name . Text.concat $ attribute <$> annotated - -closeTag :: Tag -> Editor m -closeTag = tell . Text.pack $ printf "</%s>" . name - -data EditorState = EditorState { - input :: Text - , tagStack :: [Tag] - } -type Editor m = RWST Config Text EditorState m () - -editStack :: MonadState EditorState m => ([Tag] -> (a, [Tag])) -> m a -editStack f = state $ \editorState -> - let (output, newTagStack) = f $ tagStack editorState in - (output, editorState {tagStack = newTagStack}) - -flushTags :: Monad m => Editor m -flushTags = editStack (\stack -> (stack, [])) >>= mapM_ openTag +apply :: MonadError Error m => Config -> Annotation -> Text -> m Text +apply config annotation = + fmap snd . evalRWST (annotateWith annotation) config . start -forget :: Monad m => Int64 -> Editor m -forget count = modify $ - \editorState -> editorState {input = Text.drop count $ input editorState} +isTEI :: Text -> Bool +isTEI = (`elem` ["pos", "lemma", "msd"]) -annotateNode :: MonadError Error m => Node -> Editor m -annotateNode (Node {tag, inside}) = - editStack (\s -> ((), tag:s)) *> annotator inside *> closeTag tag +s_ :: Tag +s_ = Tag {name = "s", annotated = []} -annotator :: MonadError Error m => Annotation -> Editor m -annotator (Token t) = gets (Text.breakOn t . input) >>= wrap +filterAttributes :: MonadReader Config m => [(Text, Field)] -> m Attributes +filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI where - tokenLength = Text.length t - wrap (before, after) = - let totalLength = Text.length before + tokenLength in - if Text.length after >= tokenLength - then tell before *> flushTags *> tell t *> forget totalLength - else throwError (TermNotFound $ Text.unpack t) -annotator (Annotations l) = mapM_ annotateNode l + filterIf _ ("", _) = [] + filterIf _ (_, Nothing) = [] + filterIf False (k, Just v) = [(k, v)] + filterIf True (k, Just v) = + let k' = Text.toLower k in if isTEI k' then [(k', v)] else [] + +tagToken :: (MonadError Error m, MonadReader Config m) => + Context -> Row -> m Node +tagToken (Context {columnIndex, columnName, header, tagger}) (atLine, record) = + case splitAt columnIndex record of + (before, (Just form):after) -> do + annotated <- filterAttributes (zip header $ before ++ after) + let name = tagger form annotated + pure $ Node {tag = Tag {name, annotated}, inside = Token form} + _ -> throwError . MissingColumn atLine $ Text.unpack columnName -apply :: MonadError Error m => Config -> Annotation -> Text -> m Text -apply config annotation = fmap snd . evalRWST annotate config . start - where - start input = EditorState {input, tagStack = []} - annotate = annotator annotation *> gets input >>= tell diff --git a/lib/Text/TEIWA/Annotation/Context.hs b/lib/Text/TEIWA/Annotation/Context.hs new file mode 100644 index 0000000000000000000000000000000000000000..7a6e44ca228586a04518244d1fce8c37fa476c7a --- /dev/null +++ b/lib/Text/TEIWA/Annotation/Context.hs @@ -0,0 +1,44 @@ +{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-} +module Text.TEIWA.Annotation.Context ( + Context(..) + , ofHeader + ) where + +import Control.Applicative ((<|>)) +import Control.Monad.Except (MonadError(..)) +import Control.Monad.Reader (MonadReader(..)) +import Data.List (findIndex) +import Data.Text.Lazy as Text (Text, toLower, unpack) +import Text.Parsec (Column) +import Text.TEIWA.Config (Config(..)) +import Text.TEIWA.Error (Error(..)) +import Text.TEIWA.Source.Common (Attributes, Header, teiTagger) + +data Context = Context { + columnIndex :: Column + , columnName :: Text + , header :: Header + , tagger :: Text -> Attributes -> Text + } + +ofHeader :: (MonadError Error m, MonadReader Config m) => + Header -> m Context +ofHeader defaultFields = do + Config {formColumn, headerOverride} <- ask + let headerFields = maybe defaultFields id headerOverride + column <- catchNothing ((Right <$> formColumn) <|> findForm headerFields) + buildContext column headerFields + where + findForm = fmap Left . findIndex ((==) "form" . Text.toLower) + catchNothing = maybe (throwError NoFormColumn) return + +buildContext :: MonadError Error m => + Either Int Text -> Header -> m Context +buildContext column = build . either splitAt (break . (==)) column + where + build (_, []) = + throwError . NoSuchColumn $ either (\_ -> "form") Text.unpack column + build (before, columnName:after) = + let columnIndex = length before in + let header = (before ++ after) in + return $ Context {columnIndex, columnName, header, tagger = teiTagger} diff --git a/lib/Text/TEIWA/Annotation/Data.hs b/lib/Text/TEIWA/Annotation/Data.hs new file mode 100644 index 0000000000000000000000000000000000000000..1599c61ba6ede3d52e5b1e13d53d0fe551377878 --- /dev/null +++ b/lib/Text/TEIWA/Annotation/Data.hs @@ -0,0 +1,31 @@ +{-# LANGUAGE NamedFieldPuns #-} +module Text.TEIWA.Annotation.Data ( + Annotation(..) + , Format + , Node(..) + , Tag(..) + , openTag + , closeTag + ) where + +import Data.Text.Lazy as Text (Text, concat, pack) +import Text.TEIWA.Source.Common (Attributes, TEIWAParser, attribute) +import Text.Printf (printf) + +data Tag = Tag { + name :: Text + , annotated :: Attributes + } deriving Show +data Node = Node { + tag :: Tag + , inside :: Annotation + } deriving Show +data Annotation = Token Text | Annotations [Node] deriving Show +type Format = TEIWAParser Annotation + +openTag :: Tag -> Text +openTag (Tag {name, annotated}) = + Text.pack . printf "<%s%s>" name . Text.concat $ attribute <$> annotated + +closeTag :: Tag -> Text +closeTag = Text.pack . printf "</%s>" . name diff --git a/lib/Text/TEIWA/Annotation/Editor.hs b/lib/Text/TEIWA/Annotation/Editor.hs new file mode 100644 index 0000000000000000000000000000000000000000..78b24172beba302e20d8d90105321f91b1b8152c --- /dev/null +++ b/lib/Text/TEIWA/Annotation/Editor.hs @@ -0,0 +1,55 @@ +{-# LANGUAGE FlexibleContexts, NamedFieldPuns #-} +module Text.TEIWA.Annotation.Editor ( + annotateWith + , start + ) where + +import Control.Monad.Except (MonadError(..)) +import Control.Monad.RWS (MonadState, RWST, gets, modify, state, tell) +import Data.Int (Int64) +import Data.Text.Lazy as Text (Text, breakOn, drop, length, unpack) +import Text.TEIWA.Annotation.Data ( + Annotation(..), Node(..), Tag(..), closeTag, openTag + ) +import Text.TEIWA.Config (Config(..)) +import Text.TEIWA.Error (Error(..)) + +data State = State { + input :: Text + , tagStack :: [Tag] + } + +start :: Text -> State +start input = State {input, tagStack = []} + +type Editor m = RWST Config Text State m () + +editStack :: MonadState State m => ([Tag] -> (a, [Tag])) -> m a +editStack f = state $ \editorState -> + let (output, newTagStack) = f $ tagStack editorState in + (output, editorState {tagStack = newTagStack}) + +flushTags :: Monad m => Editor m +flushTags = editStack (\stack -> (reverse stack, [])) >>= mapM_ (tell . openTag) + +forget :: Monad m => Int64 -> Editor m +forget count = modify $ + \editorState -> editorState {input = Text.drop count $ input editorState} + +annotateNode :: MonadError Error m => Node -> Editor m +annotateNode (Node {tag, inside}) = + editStack (\s -> ((), tag:s)) *> annotator inside *> tell (closeTag tag) + +annotator :: MonadError Error m => Annotation -> Editor m +annotator (Token t) = gets (Text.breakOn t . input) >>= wrap + where + tokenLength = Text.length t + wrap (before, after) = + let totalLength = Text.length before + tokenLength in + if Text.length after >= tokenLength + then tell before *> flushTags *> tell t *> forget totalLength + else throwError (TermNotFound $ Text.unpack t) +annotator (Annotations l) = mapM_ annotateNode l + +annotateWith :: MonadError Error m => Annotation -> Editor m +annotateWith annotation = annotator annotation *> gets input >>= tell diff --git a/lib/Text/TEIWA/Source.hs b/lib/Text/TEIWA/Source.hs index d6b9f66abbd52ea06780d36982b9cebf99b45e2c..4fcb725c716b911947d5552e77fbfecff2b9d75a 100644 --- a/lib/Text/TEIWA/Source.hs +++ b/lib/Text/TEIWA/Source.hs @@ -9,79 +9,30 @@ module Text.TEIWA.Source ( , csv , runTEIWAParser , parse + , ssv , tsv , webAnno ) where -import Control.Applicative ((<|>)) import Control.Monad.Except (MonadError(..)) -import Control.Monad.Reader (MonadReader(..), ReaderT(..)) +import Control.Monad.Reader (ReaderT(..)) import Control.Monad.IO.Class (MonadIO(..)) -import Data.Text.Lazy as Text (Text, toLower, unpack) +import Data.Text.Lazy as Text (Text) import Data.Text.Lazy.IO as Text (readFile) import Text.Parsec (SourceName, runParserT) -import Text.TEIWA.Annotation ( - Annotation(..), Attributes, SentenceAnnotation(..), TokenAnnotation(..) - ) import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Error (Error(..)) -import Text.TEIWA.Source.Common ( - AnnotationContext(..), Field, Row, TEIWAParser, annotationContext - ) -import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (header, sentences) -import qualified Text.TEIWA.Source.SSV as SSV (body, header) -import qualified Text.TEIWA.Source.WebAnno as WebAnno (header, sentences) - -type Format = TEIWAParser Annotation - -isTEI :: Text -> Bool -isTEI = (`elem` ["pos", "lemma", "msd"]) - -filterAttributes :: MonadReader Config m => [(Text, Field)] -> m Attributes -filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI - where - filterIf _ ("", _) = [] - filterIf _ (_, Nothing) = [] - filterIf False (k, Just v) = [(k, v)] - filterIf True (k, Just v) = - let k' = Text.toLower k in if isTEI k' then [(k', v)] else [] - -annotateToken :: (MonadError Error m, MonadReader Config m) => - AnnotationContext -> Row -> m TokenAnnotation -annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, record) = - case splitAt columnIndex record of - (before, (Just form):after) -> - TokenAnnotation form <$> filterAttributes (zip header $ before ++ after) - _ -> throwError . MissingColumn atLine $ Text.unpack columnName - -coNLLX :: Format -coNLLX = do - context <- annotationContext CoNLLX.header - SentenceLevel <$> ( - CoNLLX.sentences >>= mapM ( - fmap SentenceAnnotation . mapM (annotateToken context) - ) - ) - -ssv :: Char -> Format -ssv separator = do - context <- annotationContext =<< SSV.header separator - TokenLevel <$> (SSV.body separator >>= mapM (annotateToken context)) +import Text.TEIWA.Annotation.Data (Annotation, Format) +import Text.TEIWA.Source.Common (TEIWAParser) +import Text.TEIWA.Source.CoNLLX (coNLLX) +import Text.TEIWA.Source.SSV (ssv) +import Text.TEIWA.Source.WebAnno (webAnno) csv :: Format csv = ssv ',' tsv :: Format -tsv = webAnno <|> ssv '\t' - -webAnno :: Format -webAnno = do - context <- annotationContext =<< WebAnno.header - SentenceLevel <$> ( - WebAnno.sentences >>= mapM ( - fmap SentenceAnnotation . mapM (annotateToken context) - ) - ) +tsv = ssv '\t' data Origin = File FilePath | Text Text diff --git a/lib/Text/TEIWA/Source/CoNLLX.hs b/lib/Text/TEIWA/Source/CoNLLX.hs index 0a7f95733ee7f13ae971648467732475b310db85..28b4b490d7c0c778b2a63299b6760a9521f5c740 100644 --- a/lib/Text/TEIWA/Source/CoNLLX.hs +++ b/lib/Text/TEIWA/Source/CoNLLX.hs @@ -1,14 +1,15 @@ -{-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleContexts, OverloadedStrings #-} module Text.TEIWA.Source.CoNLLX ( - header - , sentences + coNLLX ) where import Control.Applicative ((<|>), many) import Data.Text.Lazy as Text (concat, pack) import Text.Parsec (ParsecT, Stream, char, digit, many1, noneOf, string, try) -import Text.TEIWA.Source.Common (Field, Header, Row, eol, recordLine) +import Text.TEIWA.Annotation (s_, tagToken) +import Text.TEIWA.Annotation.Data (Annotation(..), Format, Node(..)) +import Text.TEIWA.Annotation.Context as Context (ofHeader) +import Text.TEIWA.Source.Common (Field, Header, Row, eol, recordLine, sentences) header :: Header header = [ @@ -28,7 +29,7 @@ field :: Stream s m Char => ParsecT s u m Field field = build <$> many1 (noneOf "\t\n\r") where build "_" = Nothing - build s = Just $ Text.pack s + build other = Just $ Text.pack other type Range = (Int, Int) @@ -54,12 +55,11 @@ row = recordLine (rowID >>= either range singleLine) combineFields f@(Just _) Nothing = f combineFields (Just a) (Just b) = Just $ Text.concat [a, "+", b] -type Sentence = [Row] - -sentence :: Stream s m Char => ParsecT s u m Sentence -sentence = many comment *> many1 row - where - comment = char '#' *> many (noneOf "\r\n") <* eol - -sentences :: Stream s m Char => ParsecT s u m [Sentence] -sentences = many (sentence <* many1 eol) +coNLLX :: Format +coNLLX = do + context <- Context.ofHeader header + Annotations <$> ( + sentences row >>= mapM ( + fmap (Node s_ . Annotations) . mapM (tagToken context) + ) + ) diff --git a/lib/Text/TEIWA/Source/Common.hs b/lib/Text/TEIWA/Source/Common.hs index 122da7e96b039e1447386aa5ecf0928d922f62ab..787ac02604d02abcb6e4606b2882ce7f5183204a 100644 --- a/lib/Text/TEIWA/Source/Common.hs +++ b/lib/Text/TEIWA/Source/Common.hs @@ -1,38 +1,33 @@ -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE NamedFieldPuns #-} -{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE FlexibleContexts, OverloadedStrings #-} module Text.TEIWA.Source.Common ( - AnnotationContext(..) + Attributes , Field , Header , Row , TEIWAParser - , annotationContext + , attribute , eol , recordLine + , sentences + , teiTagger ) where -import Control.Applicative ((<|>)) -import Control.Monad.Except (MonadError(..)) -import Control.Monad.Reader (MonadReader(..), ReaderT) -import Data.List (findIndex) -import Data.Text.Lazy as Text (Text, toLower, unpack) +import Control.Applicative ((<|>), many) +import Control.Monad.Reader (ReaderT) +import Data.Char (isPunctuation) +import Data.Text.Lazy as Text (Text, concat, head, length) import Text.Parsec ( - Column, Line, ParsecT, Stream, char, endOfLine, getParserState, sourceLine, statePos - , try + Line, ParsecT, Stream, char, endOfLine, getParserState, many1, noneOf + , sourceLine, statePos, try ) import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Error (Error(..)) -data AnnotationContext = AnnotationContext { - columnIndex :: Column - , columnName :: Text - , header :: Header - } - type Field = Maybe Text type Header = [Text] type Row = (Line, [Field]) +type Sentence = [Row] +type Attributes = [(Text, Text)] type TEIWAParser = ParsecT Text () (ReaderT Config (Either Error)) @@ -44,23 +39,18 @@ recordLine p = (,) <$> currentLine <*> p where currentLine = sourceLine . statePos <$> getParserState -annotationContext :: Header -> TEIWAParser AnnotationContext -annotationContext defaultFields = do - Config {formColumn, headerOverride} <- ask - let headerFields = maybe defaultFields id headerOverride - column <- catchNothing ((Right <$> formColumn) <|> findForm headerFields) - buildContext column headerFields - where - findForm = fmap Left . findIndex ((==) "form" . Text.toLower) - catchNothing = maybe (throwError NoFormColumn) return +attribute :: (Text, Text) -> Text +attribute (k, v) = Text.concat [" ", k, "=\"", v, "\""] -buildContext :: MonadError Error m => - Either Int Text -> Header -> m AnnotationContext -buildContext column = build . either splitAt (break . (==)) column +sentence :: Stream s m Char => ParsecT s u m Row -> ParsecT s u m Sentence +sentence row = many comment *> many1 row where - build (_, []) = - throwError . NoSuchColumn $ either (\_ -> "form") Text.unpack column - build (before, columnName:after) = - let columnIndex = length before in - let header = (before ++ after) in - return $ AnnotationContext {columnIndex, columnName, header} + comment = char '#' *> many (noneOf "\r\n") <* eol + +sentences :: Stream s m Char => ParsecT s u m Row -> ParsecT s u m [Sentence] +sentences row = many (many1 eol *> sentence row) + +teiTagger :: Text -> Attributes -> Text +teiTagger t _ + | Text.length t == 1 && isPunctuation (Text.head t) = "pc" + | otherwise = "w" diff --git a/lib/Text/TEIWA/Source/SSV.hs b/lib/Text/TEIWA/Source/SSV.hs index b973771d5a9fed19bffd934ba0e38ddc11bc44a2..ed5d3e0a082f8ca98bc4423077f8cbfe1b3848bb 100644 --- a/lib/Text/TEIWA/Source/SSV.hs +++ b/lib/Text/TEIWA/Source/SSV.hs @@ -1,7 +1,6 @@ {-# LANGUAGE FlexibleContexts #-} module Text.TEIWA.Source.SSV ( - body - , header + ssv ) where import Control.Applicative ((<|>), many) @@ -9,6 +8,9 @@ import Control.Monad.Except (MonadError(..)) import Data.Maybe (catMaybes) import Data.Text.Lazy as Text (pack) import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try) +import Text.TEIWA.Annotation (tagToken) +import Text.TEIWA.Annotation.Context as Context (ofHeader) +import Text.TEIWA.Annotation.Data (Annotation(..), Format) import Text.TEIWA.Source.Common (Field, Header, Row, TEIWAParser, eol, recordLine) import Text.TEIWA.Error (Error(..)) @@ -33,3 +35,8 @@ header separator = fields separator >>= ensureNotEmpty . catMaybes body :: Stream s m Char => Char -> ParsecT s u m [Row] body = many . recordLine . fields + +ssv :: Char -> Format +ssv separator = do + context <- Context.ofHeader =<< header separator + Annotations <$> (mapM (tagToken context) =<< body separator) diff --git a/lib/Text/TEIWA/Source/WebAnno.hs b/lib/Text/TEIWA/Source/WebAnno.hs index a20a2a91ba4d6f727d8eca6eaa78cb3ddb3b96bc..ed75a3eff5c234e5fe6c60d57e8e5a1079210ebb 100644 --- a/lib/Text/TEIWA/Source/WebAnno.hs +++ b/lib/Text/TEIWA/Source/WebAnno.hs @@ -1,8 +1,6 @@ -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE FlexibleContexts, OverloadedStrings #-} module Text.TEIWA.Source.WebAnno ( - header - , sentences + webAnno ) where import Control.Applicative (many) @@ -10,8 +8,11 @@ import Data.Text.Lazy as Text (pack) import Text.Parsec ( ParsecT, Stream, char, digit, many1, noneOf, sepBy1, skipMany1, string ) +import Text.TEIWA.Annotation (s_, tagToken) +import Text.TEIWA.Annotation.Context as Context (ofHeader) +import Text.TEIWA.Annotation.Data (Annotation(..), Format, Node(..)) import Text.TEIWA.Source.Common ( - Field, Header, Row, TEIWAParser, eol, recordLine + Field, Header, Row, TEIWAParser, eol, recordLine, sentences ) header :: TEIWAParser Header @@ -32,20 +33,19 @@ field :: Stream s m Char => ParsecT s u m Field field = build <$> many1 (noneOf "\t\n\r") where build "_" = Nothing - build s = Just . Text.pack $ unescape s + build other = Just . Text.pack $ unescape other unescape "" = "" - unescape ('\\':c:s) = c:unescape s - unescape (c:s) = c:unescape s + unescape ('\\':c:cs) = c:unescape cs + unescape (c:cs) = c:unescape cs row :: Stream s m Char => ParsecT s u m Row row = recordLine (many1 (field <* char '\t') <* eol) -type Sentence = [Row] - -sentence :: Stream s m Char => ParsecT s u m Sentence -sentence = many comment *> many1 row - where - comment = char '#' *> many (noneOf "\r\n") <* eol - -sentences :: Stream s m Char => ParsecT s u m [Sentence] -sentences = many (many1 eol *> sentence) +webAnno :: Format +webAnno = do + context <- Context.ofHeader =<< header + Annotations <$> ( + sentences row >>= mapM ( + fmap (Node s_ . Annotations) . mapM (tagToken context) + ) + ) diff --git a/teiwa.cabal b/teiwa.cabal index bfd3856cd6f55aadd3d36ef0012ec2d674fdbe59..ded3950c929c5ca5f73f910f6fb7172819a24883 100644 --- a/teiwa.cabal +++ b/teiwa.cabal @@ -25,6 +25,9 @@ extra-source-files: CHANGELOG.md library exposed-modules: Text.TEIWA other-modules: Text.TEIWA.Annotation + , Text.TEIWA.Annotation.Context + , Text.TEIWA.Annotation.Data + , Text.TEIWA.Annotation.Editor , Text.TEIWA.Config , Text.TEIWA.Error , Text.TEIWA.Source