Skip to content
Snippets Groups Projects
Commit 9081ea79 authored by Alice Brenon's avatar Alice Brenon
Browse files

A little bit of gardening: splitting a module growing a little bit too bushy,...

A little bit of gardening: splitting a module growing a little bit too bushy, regrouping some conceptually linked operations
parent a3f12ffd
No related branches found
No related tags found
No related merge requests found
......@@ -2,10 +2,13 @@
module Main where
import CLI (Command(..), getCommand)
import Control.Applicative ((<|>))
import Control.Monad.Except (ExceptT(..), runExceptT)
import Data.Text.Lazy as Text (Text)
import Data.Text.Lazy.IO as Text (getContents, putStr)
import Text.TEIWA (Error, Origin(..), Source(..), annotateWith, coNLLX, csv, tsv)
import Text.TEIWA (
Error, Origin(..), Source(..), annotateWith, coNLLX, csv, tsv, webAnno
)
import System.FilePath (takeExtension)
import System.Exit (die)
......@@ -16,7 +19,7 @@ annotator (Command {annotationsFile, config}) =
extension = takeExtension annotationsFile
format
| extension == ".csv" = csv
| extension == ".tsv" = tsv
| extension == ".tsv" = webAnno <|> tsv
| otherwise = coNLLX
main :: IO ()
......
......@@ -6,9 +6,7 @@ module Text.TEIWA (
, module Source
, annotate
, annotateWith
, fromCSV
, fromCoNLLX
, fromWebAnno
, from
) where
import Control.Monad.Except (MonadError(..))
......@@ -30,11 +28,5 @@ annotateWith config source input = do
annotate :: (MonadError Error m, MonadIO m) => Source -> Filter m
annotate = annotateWith defaultConfig
fromCSV :: (MonadError Error m, MonadIO m) => Origin -> Filter m
fromCSV = annotate . Source csv
fromCoNLLX :: (MonadError Error m, MonadIO m) => Origin -> Filter m
fromCoNLLX = annotate . Source coNLLX
fromWebAnno :: (MonadError Error m, MonadIO m) => Origin -> Filter m
fromWebAnno = annotate . Source webAnno
from :: (MonadError Error m, MonadIO m) => Format -> Origin -> Filter m
from f = annotate . Source f
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-}
module Text.TEIWA.Annotation (
Annotation(..)
, Attributes
, Node(..)
, Tag(..)
, apply
apply
, s_
, tagToken
) where
import Control.Monad.Except (MonadError(..))
import Control.Monad.RWS (RWST, evalRWST, gets, modify, tell)
import Data.Int (Int64)
import Data.Text.Lazy as Text (Text, breakOn, concat, drop, length, pack, unpack)
import Text.Printf (printf)
import Control.Monad.RWS (MonadReader, evalRWST, reader)
import Data.Text.Lazy as Text (Text, toLower, unpack)
import Text.TEIWA.Annotation.Context (Context(..))
import Text.TEIWA.Annotation.Data (Annotation(..), Node(..), Tag(..))
import Text.TEIWA.Annotation.Editor (annotateWith, start)
import Text.TEIWA.Config (Config(..))
import Text.TEIWA.Error (Error(..))
import Text.TEIWA.Source.Common (Attributes, Field, Row)
type Attributes = [(Text, Text)]
data Tag = Tag {
name :: Text
, annotated :: Attributes
} deriving Show
data Node = Node {
tag :: Tag
, inside :: Annotation
} deriving Show
data Annotation = Token Text | Annotations [Node] deriving Show
attribute :: (Text, Text) -> Text
attribute (k, v) = Text.concat [" ", k, "=\"", v, "\""]
openTag :: Tag -> Editor m
openTag (Tag {name, annotated}) = tell . Text.pack $
printf "<%s%s>" name . Text.concat $ attribute <$> annotated
closeTag :: Tag -> Editor m
closeTag = tell . Text.pack $ printf "</%s>" . name
data EditorState = EditorState {
input :: Text
, tagStack :: [Tag]
}
type Editor m = RWST Config Text EditorState m ()
editStack :: MonadState EditorState m => ([Tag] -> (a, [Tag])) -> m a
editStack f = state $ \editorState ->
let (output, newTagStack) = f $ tagStack editorState in
(output, editorState {tagStack = newTagStack})
flushTags :: Monad m => Editor m
flushTags = editStack (\stack -> (stack, [])) >>= mapM_ openTag
apply :: MonadError Error m => Config -> Annotation -> Text -> m Text
apply config annotation =
fmap snd . evalRWST (annotateWith annotation) config . start
forget :: Monad m => Int64 -> Editor m
forget count = modify $
\editorState -> editorState {input = Text.drop count $ input editorState}
isTEI :: Text -> Bool
isTEI = (`elem` ["pos", "lemma", "msd"])
annotateNode :: MonadError Error m => Node -> Editor m
annotateNode (Node {tag, inside}) =
editStack (\s -> ((), tag:s)) *> annotator inside *> closeTag tag
s_ :: Tag
s_ = Tag {name = "s", annotated = []}
annotator :: MonadError Error m => Annotation -> Editor m
annotator (Token t) = gets (Text.breakOn t . input) >>= wrap
filterAttributes :: MonadReader Config m => [(Text, Field)] -> m Attributes
filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI
where
tokenLength = Text.length t
wrap (before, after) =
let totalLength = Text.length before + tokenLength in
if Text.length after >= tokenLength
then tell before *> flushTags *> tell t *> forget totalLength
else throwError (TermNotFound $ Text.unpack t)
annotator (Annotations l) = mapM_ annotateNode l
filterIf _ ("", _) = []
filterIf _ (_, Nothing) = []
filterIf False (k, Just v) = [(k, v)]
filterIf True (k, Just v) =
let k' = Text.toLower k in if isTEI k' then [(k', v)] else []
tagToken :: (MonadError Error m, MonadReader Config m) =>
Context -> Row -> m Node
tagToken (Context {columnIndex, columnName, header, tagger}) (atLine, record) =
case splitAt columnIndex record of
(before, (Just form):after) -> do
annotated <- filterAttributes (zip header $ before ++ after)
let name = tagger form annotated
pure $ Node {tag = Tag {name, annotated}, inside = Token form}
_ -> throwError . MissingColumn atLine $ Text.unpack columnName
apply :: MonadError Error m => Config -> Annotation -> Text -> m Text
apply config annotation = fmap snd . evalRWST annotate config . start
where
start input = EditorState {input, tagStack = []}
annotate = annotator annotation *> gets input >>= tell
{-# LANGUAGE FlexibleContexts, NamedFieldPuns, OverloadedStrings #-}
module Text.TEIWA.Annotation.Context (
Context(..)
, ofHeader
) where
import Control.Applicative ((<|>))
import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (MonadReader(..))
import Data.List (findIndex)
import Data.Text.Lazy as Text (Text, toLower, unpack)
import Text.Parsec (Column)
import Text.TEIWA.Config (Config(..))
import Text.TEIWA.Error (Error(..))
import Text.TEIWA.Source.Common (Attributes, Header, teiTagger)
data Context = Context {
columnIndex :: Column
, columnName :: Text
, header :: Header
, tagger :: Text -> Attributes -> Text
}
ofHeader :: (MonadError Error m, MonadReader Config m) =>
Header -> m Context
ofHeader defaultFields = do
Config {formColumn, headerOverride} <- ask
let headerFields = maybe defaultFields id headerOverride
column <- catchNothing ((Right <$> formColumn) <|> findForm headerFields)
buildContext column headerFields
where
findForm = fmap Left . findIndex ((==) "form" . Text.toLower)
catchNothing = maybe (throwError NoFormColumn) return
buildContext :: MonadError Error m =>
Either Int Text -> Header -> m Context
buildContext column = build . either splitAt (break . (==)) column
where
build (_, []) =
throwError . NoSuchColumn $ either (\_ -> "form") Text.unpack column
build (before, columnName:after) =
let columnIndex = length before in
let header = (before ++ after) in
return $ Context {columnIndex, columnName, header, tagger = teiTagger}
{-# LANGUAGE NamedFieldPuns #-}
module Text.TEIWA.Annotation.Data (
Annotation(..)
, Format
, Node(..)
, Tag(..)
, openTag
, closeTag
) where
import Data.Text.Lazy as Text (Text, concat, pack)
import Text.TEIWA.Source.Common (Attributes, TEIWAParser, attribute)
import Text.Printf (printf)
data Tag = Tag {
name :: Text
, annotated :: Attributes
} deriving Show
data Node = Node {
tag :: Tag
, inside :: Annotation
} deriving Show
data Annotation = Token Text | Annotations [Node] deriving Show
type Format = TEIWAParser Annotation
openTag :: Tag -> Text
openTag (Tag {name, annotated}) =
Text.pack . printf "<%s%s>" name . Text.concat $ attribute <$> annotated
closeTag :: Tag -> Text
closeTag = Text.pack . printf "</%s>" . name
{-# LANGUAGE FlexibleContexts, NamedFieldPuns #-}
module Text.TEIWA.Annotation.Editor (
annotateWith
, start
) where
import Control.Monad.Except (MonadError(..))
import Control.Monad.RWS (MonadState, RWST, gets, modify, state, tell)
import Data.Int (Int64)
import Data.Text.Lazy as Text (Text, breakOn, drop, length, unpack)
import Text.TEIWA.Annotation.Data (
Annotation(..), Node(..), Tag(..), closeTag, openTag
)
import Text.TEIWA.Config (Config(..))
import Text.TEIWA.Error (Error(..))
data State = State {
input :: Text
, tagStack :: [Tag]
}
start :: Text -> State
start input = State {input, tagStack = []}
type Editor m = RWST Config Text State m ()
editStack :: MonadState State m => ([Tag] -> (a, [Tag])) -> m a
editStack f = state $ \editorState ->
let (output, newTagStack) = f $ tagStack editorState in
(output, editorState {tagStack = newTagStack})
flushTags :: Monad m => Editor m
flushTags = editStack (\stack -> (reverse stack, [])) >>= mapM_ (tell . openTag)
forget :: Monad m => Int64 -> Editor m
forget count = modify $
\editorState -> editorState {input = Text.drop count $ input editorState}
annotateNode :: MonadError Error m => Node -> Editor m
annotateNode (Node {tag, inside}) =
editStack (\s -> ((), tag:s)) *> annotator inside *> tell (closeTag tag)
annotator :: MonadError Error m => Annotation -> Editor m
annotator (Token t) = gets (Text.breakOn t . input) >>= wrap
where
tokenLength = Text.length t
wrap (before, after) =
let totalLength = Text.length before + tokenLength in
if Text.length after >= tokenLength
then tell before *> flushTags *> tell t *> forget totalLength
else throwError (TermNotFound $ Text.unpack t)
annotator (Annotations l) = mapM_ annotateNode l
annotateWith :: MonadError Error m => Annotation -> Editor m
annotateWith annotation = annotator annotation *> gets input >>= tell
......@@ -9,79 +9,30 @@ module Text.TEIWA.Source (
, csv
, runTEIWAParser
, parse
, ssv
, tsv
, webAnno
) where
import Control.Applicative ((<|>))
import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (MonadReader(..), ReaderT(..))
import Control.Monad.Reader (ReaderT(..))
import Control.Monad.IO.Class (MonadIO(..))
import Data.Text.Lazy as Text (Text, toLower, unpack)
import Data.Text.Lazy as Text (Text)
import Data.Text.Lazy.IO as Text (readFile)
import Text.Parsec (SourceName, runParserT)
import Text.TEIWA.Annotation (
Annotation(..), Attributes, SentenceAnnotation(..), TokenAnnotation(..)
)
import Text.TEIWA.Config (Config(..))
import Text.TEIWA.Error (Error(..))
import Text.TEIWA.Source.Common (
AnnotationContext(..), Field, Row, TEIWAParser, annotationContext
)
import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (header, sentences)
import qualified Text.TEIWA.Source.SSV as SSV (body, header)
import qualified Text.TEIWA.Source.WebAnno as WebAnno (header, sentences)
type Format = TEIWAParser Annotation
isTEI :: Text -> Bool
isTEI = (`elem` ["pos", "lemma", "msd"])
filterAttributes :: MonadReader Config m => [(Text, Field)] -> m Attributes
filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI
where
filterIf _ ("", _) = []
filterIf _ (_, Nothing) = []
filterIf False (k, Just v) = [(k, v)]
filterIf True (k, Just v) =
let k' = Text.toLower k in if isTEI k' then [(k', v)] else []
annotateToken :: (MonadError Error m, MonadReader Config m) =>
AnnotationContext -> Row -> m TokenAnnotation
annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, record) =
case splitAt columnIndex record of
(before, (Just form):after) ->
TokenAnnotation form <$> filterAttributes (zip header $ before ++ after)
_ -> throwError . MissingColumn atLine $ Text.unpack columnName
coNLLX :: Format
coNLLX = do
context <- annotationContext CoNLLX.header
SentenceLevel <$> (
CoNLLX.sentences >>= mapM (
fmap SentenceAnnotation . mapM (annotateToken context)
)
)
ssv :: Char -> Format
ssv separator = do
context <- annotationContext =<< SSV.header separator
TokenLevel <$> (SSV.body separator >>= mapM (annotateToken context))
import Text.TEIWA.Annotation.Data (Annotation, Format)
import Text.TEIWA.Source.Common (TEIWAParser)
import Text.TEIWA.Source.CoNLLX (coNLLX)
import Text.TEIWA.Source.SSV (ssv)
import Text.TEIWA.Source.WebAnno (webAnno)
csv :: Format
csv = ssv ','
tsv :: Format
tsv = webAnno <|> ssv '\t'
webAnno :: Format
webAnno = do
context <- annotationContext =<< WebAnno.header
SentenceLevel <$> (
WebAnno.sentences >>= mapM (
fmap SentenceAnnotation . mapM (annotateToken context)
)
)
tsv = ssv '\t'
data Origin = File FilePath | Text Text
......
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleContexts, OverloadedStrings #-}
module Text.TEIWA.Source.CoNLLX (
header
, sentences
coNLLX
) where
import Control.Applicative ((<|>), many)
import Data.Text.Lazy as Text (concat, pack)
import Text.Parsec (ParsecT, Stream, char, digit, many1, noneOf, string, try)
import Text.TEIWA.Source.Common (Field, Header, Row, eol, recordLine)
import Text.TEIWA.Annotation (s_, tagToken)
import Text.TEIWA.Annotation.Data (Annotation(..), Format, Node(..))
import Text.TEIWA.Annotation.Context as Context (ofHeader)
import Text.TEIWA.Source.Common (Field, Header, Row, eol, recordLine, sentences)
header :: Header
header = [
......@@ -28,7 +29,7 @@ field :: Stream s m Char => ParsecT s u m Field
field = build <$> many1 (noneOf "\t\n\r")
where
build "_" = Nothing
build s = Just $ Text.pack s
build other = Just $ Text.pack other
type Range = (Int, Int)
......@@ -54,12 +55,11 @@ row = recordLine (rowID >>= either range singleLine)
combineFields f@(Just _) Nothing = f
combineFields (Just a) (Just b) = Just $ Text.concat [a, "+", b]
type Sentence = [Row]
sentence :: Stream s m Char => ParsecT s u m Sentence
sentence = many comment *> many1 row
where
comment = char '#' *> many (noneOf "\r\n") <* eol
sentences :: Stream s m Char => ParsecT s u m [Sentence]
sentences = many (sentence <* many1 eol)
coNLLX :: Format
coNLLX = do
context <- Context.ofHeader header
Annotations <$> (
sentences row >>= mapM (
fmap (Node s_ . Annotations) . mapM (tagToken context)
)
)
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts, OverloadedStrings #-}
module Text.TEIWA.Source.Common (
AnnotationContext(..)
Attributes
, Field
, Header
, Row
, TEIWAParser
, annotationContext
, attribute
, eol
, recordLine
, sentences
, teiTagger
) where
import Control.Applicative ((<|>))
import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (MonadReader(..), ReaderT)
import Data.List (findIndex)
import Data.Text.Lazy as Text (Text, toLower, unpack)
import Control.Applicative ((<|>), many)
import Control.Monad.Reader (ReaderT)
import Data.Char (isPunctuation)
import Data.Text.Lazy as Text (Text, concat, head, length)
import Text.Parsec (
Column, Line, ParsecT, Stream, char, endOfLine, getParserState, sourceLine, statePos
, try
Line, ParsecT, Stream, char, endOfLine, getParserState, many1, noneOf
, sourceLine, statePos, try
)
import Text.TEIWA.Config (Config(..))
import Text.TEIWA.Error (Error(..))
data AnnotationContext = AnnotationContext {
columnIndex :: Column
, columnName :: Text
, header :: Header
}
type Field = Maybe Text
type Header = [Text]
type Row = (Line, [Field])
type Sentence = [Row]
type Attributes = [(Text, Text)]
type TEIWAParser = ParsecT Text () (ReaderT Config (Either Error))
......@@ -44,23 +39,18 @@ recordLine p = (,) <$> currentLine <*> p
where
currentLine = sourceLine . statePos <$> getParserState
annotationContext :: Header -> TEIWAParser AnnotationContext
annotationContext defaultFields = do
Config {formColumn, headerOverride} <- ask
let headerFields = maybe defaultFields id headerOverride
column <- catchNothing ((Right <$> formColumn) <|> findForm headerFields)
buildContext column headerFields
where
findForm = fmap Left . findIndex ((==) "form" . Text.toLower)
catchNothing = maybe (throwError NoFormColumn) return
attribute :: (Text, Text) -> Text
attribute (k, v) = Text.concat [" ", k, "=\"", v, "\""]
buildContext :: MonadError Error m =>
Either Int Text -> Header -> m AnnotationContext
buildContext column = build . either splitAt (break . (==)) column
sentence :: Stream s m Char => ParsecT s u m Row -> ParsecT s u m Sentence
sentence row = many comment *> many1 row
where
build (_, []) =
throwError . NoSuchColumn $ either (\_ -> "form") Text.unpack column
build (before, columnName:after) =
let columnIndex = length before in
let header = (before ++ after) in
return $ AnnotationContext {columnIndex, columnName, header}
comment = char '#' *> many (noneOf "\r\n") <* eol
sentences :: Stream s m Char => ParsecT s u m Row -> ParsecT s u m [Sentence]
sentences row = many (many1 eol *> sentence row)
teiTagger :: Text -> Attributes -> Text
teiTagger t _
| Text.length t == 1 && isPunctuation (Text.head t) = "pc"
| otherwise = "w"
{-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Source.SSV (
body
, header
ssv
) where
import Control.Applicative ((<|>), many)
......@@ -9,6 +8,9 @@ import Control.Monad.Except (MonadError(..))
import Data.Maybe (catMaybes)
import Data.Text.Lazy as Text (pack)
import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try)
import Text.TEIWA.Annotation (tagToken)
import Text.TEIWA.Annotation.Context as Context (ofHeader)
import Text.TEIWA.Annotation.Data (Annotation(..), Format)
import Text.TEIWA.Source.Common (Field, Header, Row, TEIWAParser, eol, recordLine)
import Text.TEIWA.Error (Error(..))
......@@ -33,3 +35,8 @@ header separator = fields separator >>= ensureNotEmpty . catMaybes
body :: Stream s m Char => Char -> ParsecT s u m [Row]
body = many . recordLine . fields
ssv :: Char -> Format
ssv separator = do
context <- Context.ofHeader =<< header separator
Annotations <$> (mapM (tagToken context) =<< body separator)
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts, OverloadedStrings #-}
module Text.TEIWA.Source.WebAnno (
header
, sentences
webAnno
) where
import Control.Applicative (many)
......@@ -10,8 +8,11 @@ import Data.Text.Lazy as Text (pack)
import Text.Parsec (
ParsecT, Stream, char, digit, many1, noneOf, sepBy1, skipMany1, string
)
import Text.TEIWA.Annotation (s_, tagToken)
import Text.TEIWA.Annotation.Context as Context (ofHeader)
import Text.TEIWA.Annotation.Data (Annotation(..), Format, Node(..))
import Text.TEIWA.Source.Common (
Field, Header, Row, TEIWAParser, eol, recordLine
Field, Header, Row, TEIWAParser, eol, recordLine, sentences
)
header :: TEIWAParser Header
......@@ -32,20 +33,19 @@ field :: Stream s m Char => ParsecT s u m Field
field = build <$> many1 (noneOf "\t\n\r")
where
build "_" = Nothing
build s = Just . Text.pack $ unescape s
build other = Just . Text.pack $ unescape other
unescape "" = ""
unescape ('\\':c:s) = c:unescape s
unescape (c:s) = c:unescape s
unescape ('\\':c:cs) = c:unescape cs
unescape (c:cs) = c:unescape cs
row :: Stream s m Char => ParsecT s u m Row
row = recordLine (many1 (field <* char '\t') <* eol)
type Sentence = [Row]
sentence :: Stream s m Char => ParsecT s u m Sentence
sentence = many comment *> many1 row
where
comment = char '#' *> many (noneOf "\r\n") <* eol
sentences :: Stream s m Char => ParsecT s u m [Sentence]
sentences = many (many1 eol *> sentence)
webAnno :: Format
webAnno = do
context <- Context.ofHeader =<< header
Annotations <$> (
sentences row >>= mapM (
fmap (Node s_ . Annotations) . mapM (tagToken context)
)
)
......@@ -25,6 +25,9 @@ extra-source-files: CHANGELOG.md
library
exposed-modules: Text.TEIWA
other-modules: Text.TEIWA.Annotation
, Text.TEIWA.Annotation.Context
, Text.TEIWA.Annotation.Data
, Text.TEIWA.Annotation.Editor
, Text.TEIWA.Config
, Text.TEIWA.Error
, Text.TEIWA.Source
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment