Skip to content
Snippets Groups Projects
Commit f3fdb961 authored by Alice Brenon's avatar Alice Brenon
Browse files

First working version, missing <pc> and strict TEI

parents
No related branches found
No related tags found
No related merge requests found
# ---> Haskell
dist
dist-*
cabal-dev
*.o
*.hi
*.chi
*.chs.h
*.dyn_o
*.dyn_hi
.hpc
.hsenv
.cabal-sandbox/
cabal.sandbox.config
*.prof
*.aux
*.hp
*.eventlog
.stack-work/
cabal.project.local
cabal.project.local~
.HTF/
.ghc.environment.*
# Revision history for teiwa
## 0.1.0.0 -- YYYY-mm-dd
* First version. Released on an unsuspecting world.
LICENSE 0 → 100644
Copyright (c) 2021, Alice BRENON
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Alice BRENON nor the names of other
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import Distribution.Simple
main = defaultMain
{-# LANGUAGE NamedFieldPuns #-}
module CLI (
Command(..)
, getCommand
) where
import Data.Version (showVersion)
import Control.Applicative ((<*>), optional)
import Options.Applicative (
Parser, ReadM, strArgument, execParser, fullDesc, header, help, helper
, info, long, metavar, option, short, str, switch, value
)
import qualified Paths_teiwa as Teiwa (version)
import Text.TEIWA (Config(..), defaultConfig)
data Command = Command {
annotationsFile :: FilePath
, config :: Config
}
charPredicate :: ReadM (Char -> Bool)
charPredicate = flip elem <$> (str :: ReadM String)
configOptions :: Parser Config
configOptions = Config
<$> option (optional str) (short 'c' <> long "formColumn" <> value formColumn
<> help "the column to use as the form"
)
<*> option charPredicate (short 'p' <> long "punctuation" <> value punctuation
<> help "characters to encode as punctuation (defaults to Data.Char.isPunctuation)"
)
<*> switch (short 's' <> long "strictTEI"
<> help "only use TEI's att.linguistic on the elements"
)
where
Config {formColumn, punctuation} = defaultConfig
command :: Parser Command
command = Command
<$> strArgument (metavar "ANNOTATIONS" <> help "annotations to apply")
<*> configOptions
getCommand :: IO Command
getCommand = execParser $
info
(helper <*> command)
(fullDesc <> header ("teiwa v" ++ showVersion Teiwa.version))
{-# LANGUAGE NamedFieldPuns #-}
module Main where
import CLI (Command(..), getCommand)
import Control.Monad.Except (ExceptT(..), runExceptT)
import Data.Text.Lazy as Text (Text)
import Data.Text.Lazy.IO as Text (getContents, putStr)
import Text.TEIWA (Error, Origin(..), Source(..), annotateWith, coNLLX, csv, tsv)
import System.FilePath (takeExtension)
import System.Exit (die)
annotator :: Command -> Text -> ExceptT Error IO Text
annotator (Command {annotationsFile, config}) =
annotateWith config $ Source format (File annotationsFile)
where
extension = takeExtension annotationsFile
format
| extension == ".csv" = csv
| extension == ".tsv" = tsv
| otherwise = coNLLX
main :: IO ()
main = do
command <- getCommand
input <- Text.getContents
runExceptT (annotator command input) >>= either (die . show) Text.putStr
where
{-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA (
module Annotation
, module Text.TEIWA.Config
, module Text.TEIWA.Error
, module Source
, annotate
, annotateWith
, fromCSV
, fromCoNLLX
) where
import Control.Monad.Except (MonadError(..))
import Control.Monad.IO.Class (MonadIO(..))
import Data.Text.Lazy as Text (Text)
import Text.TEIWA.Annotation as Annotation
import Text.TEIWA.Config
import Text.TEIWA.Error
import Text.TEIWA.Source as Source
type Filter m = Text -> m Text
annotateWith :: (MonadError Error m, MonadIO m) =>
Config -> Source -> Filter m
annotateWith config source input = do
annotation <- Source.parse config source
Annotation.apply config annotation input
annotate :: (MonadError Error m, MonadIO m) => Source -> Filter m
annotate = annotateWith defaultConfig
fromCSV :: (MonadError Error m, MonadIO m) => Origin -> Filter m
fromCSV = annotate . Source csv
fromCoNLLX :: (MonadError Error m, MonadIO m) => Origin -> Filter m
fromCoNLLX = annotate . Source coNLLX
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Annotation (
Annotation(..)
, Attributes
, SentenceAnnotation(..)
, TokenAnnotation(..)
, apply
) where
import Control.Monad.Except (MonadError(..))
import Control.Monad.RWS (RWST, evalRWST, get, put, tell)
import Data.Text.Lazy as Text (Text, breakOn, concat, drop, length, unpack)
import Text.TEIWA.Config (Config)
import Text.TEIWA.Error (Error(..))
type Attributes = [(Text, Text)]
data TokenAnnotation = TokenAnnotation {
form :: Text
, annotated :: Attributes
} deriving Show
newtype SentenceAnnotation = SentenceAnnotation {
getTokens :: [TokenAnnotation]
}
data Annotation =
SentenceLevel [SentenceAnnotation]
| TokenLevel [TokenAnnotation]
attribute :: (Text, Text) -> Text
attribute (k, v) = Text.concat [" ", k, "=\"", v, "\""]
token :: Text -> TokenAnnotation -> Text
token elementName (TokenAnnotation {form, annotated}) =
Text.concat (openTag ++ (form:closeTag))
where
openTag = ("<":elementName:(attribute <$> annotated) ++ [">"])
closeTag = ["</", elementName,">"]
type Editor m = RWST Config Text Text m ()
pc :: TokenAnnotation -> Text
pc = token "pc"
w :: TokenAnnotation -> Text
w = token "w"
check :: MonadError Error m => Maybe Text -> TokenAnnotation -> (Text, Text) -> Editor m
check openingTag expected@(TokenAnnotation {form}) (before, focused) =
let expectedLength = Text.length form in
if Text.length focused >= expectedLength
then put (Text.drop expectedLength focused)
*> mapM_ tell [before, maybe "" id openingTag, w expected]
else throwError (TermNotFound $ Text.unpack form)
tokenLevel :: MonadError Error m => Maybe Text -> [TokenAnnotation] -> Editor m
tokenLevel Nothing [] = pure ()
tokenLevel (Just t) [] = tell t
tokenLevel openingTag (annotation@(TokenAnnotation {form}):others) =
((Text.breakOn form <$> get) >>= check openingTag annotation)
*> (tokenLevel Nothing others)
sentenceLevel :: MonadError Error m => SentenceAnnotation -> Editor m
sentenceLevel sentenceAnnotation =
tokenLevel (Just "<s>") (getTokens sentenceAnnotation) *> tell "</s>"
apply :: MonadError Error m => Config -> Annotation -> Text -> m Text
apply config annotation =
fmap snd . evalRWST (terms annotation *> get >>= tell) config
where
terms (SentenceLevel sentences) = mapM_ sentenceLevel sentences
terms (TokenLevel tokens) = tokenLevel Nothing tokens
module Text.TEIWA.Config (
Config(..)
, defaultConfig
) where
import Data.Char (isPunctuation)
import Data.Text.Lazy (Text)
data Config = Config {
formColumn :: Maybe Text
, punctuation :: Char -> Bool
, strictTEI :: Bool
}
defaultConfig :: Config
defaultConfig = Config {
formColumn = Nothing
, punctuation = isPunctuation
, strictTEI = False
}
module Text.TEIWA.Error (
Error(..)
) where
import Text.Parsec (Line, ParseError)
import Text.Printf (printf)
data Error =
NoSuchColumn String
| EmptyHeader
| MissingColumn Line String
| ParsingError ParseError
| TermNotFound String
instance Show Error where
show (NoSuchColumn s) = printf "\"%s\" isn't a valid column name in this file" s
show EmptyHeader = printf "The CSV file header is empty"
show (MissingColumn l s) = printf "Line %d is missing a value for column %s" l s
show (ParsingError e) = show e
show (TermNotFound t) = printf "Annotated term \"%s\" wasn't found in the input" t
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Source (
Format
, Origin(..)
, TEIWAParser
, Source(..)
, coNLLX
, csv
, runTEIWAParser
, parse
, tsv
) where
import Control.Monad.Except (MonadError(..))
import Control.Monad.IO.Class (MonadIO(..))
import Data.Text.Lazy as Text (Text, unpack)
import Data.Text.Lazy.IO as Text (readFile)
import Text.Parsec (ParsecT, SourceName, runParserT)
import Text.TEIWA.Annotation (
Annotation(..), SentenceAnnotation(..), TokenAnnotation(..)
)
import Text.TEIWA.Config (Config(..))
import Text.TEIWA.Error (Error(..))
import Text.TEIWA.Source.Common (AnnotationContext(..), Row)
import qualified Text.TEIWA.Source.ConLLX as ConLLX (getContext, sentences)
import qualified Text.TEIWA.Source.CSV as CSV (body, getContext)
type TEIWAParser = ParsecT Text () (Either Error)
type Format = Config -> TEIWAParser Annotation
annotateToken :: MonadError Error m =>
AnnotationContext -> Row -> m TokenAnnotation
annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, record) =
case splitAt columnIndex record of
(_, []) -> throwError . MissingColumn atLine $ Text.unpack columnName
(before, form:after) ->
return $ TokenAnnotation {form, annotated = zip header (before ++ after)}
coNLLX :: Format
coNLLX (Config {formColumn}) = do
context <- ConLLX.getContext formColumn
SentenceLevel <$> (
ConLLX.sentences >>= mapM (
fmap SentenceAnnotation . mapM (annotateToken context)
)
)
csv :: Format
csv (Config {formColumn}) = do
context <- CSV.getContext ',' formColumn
TokenLevel <$> (CSV.body ',' >>= mapM (annotateToken context))
tsv :: Format
tsv (Config {formColumn}) = do
context <- CSV.getContext '\t' formColumn
TokenLevel <$> (CSV.body '\t' >>= mapM (annotateToken context))
data Origin = File FilePath | Text Text
data Source = Source {
format :: Format
, origin :: Origin
}
runTEIWAParser :: MonadError Error m =>
TEIWAParser a -> SourceName -> Text -> m a
runTEIWAParser p s = flattenErrors . runParserT p () s
where
flattenErrors = either throwError (either (throwError . ParsingError) pure)
parse :: (MonadIO m, MonadError Error m) =>
Config -> Source -> m Annotation
parse config (Source {format, origin}) = parseFrom (format config) origin
where
parseFrom p (File f) = liftIO (Text.readFile f) >>= runTEIWAParser p f
parseFrom p (Text t) = runTEIWAParser p "" t
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Source.CSV (
body
, getContext
) where
import Control.Applicative ((<|>), many)
import Control.Monad.Except (MonadError(..))
import Data.Text.Lazy as Text (Text, pack, unpack)
import Text.Parsec (ParsecT, Stream, between, char, noneOf, sepBy, string, try)
import Text.TEIWA.Error (Error(..))
import Text.TEIWA.Source.Common (
AnnotationContext(..), Field, Row, eol, recordLine
)
field :: Stream s m Char => Char -> ParsecT s u m Field
field separator = Text.pack <$> (regular <|> quoted)
where
regular = many (noneOf $ separator:"\n\r\"")
quoted = between quote quote $
many (noneOf "\"" <|> try (string "\"\"" *> pure '"'))
quote = char '"'
fields :: Stream s m Char => Char -> ParsecT s u m [Field]
fields separator = (field separator `sepBy` char separator) <* eol
body :: Stream s m Char => Char -> ParsecT s u m [Row]
body = many . recordLine . fields
getContext :: (Stream s m Char, MonadError Error m) =>
Char -> Maybe Text -> ParsecT s u m AnnotationContext
getContext separator formColumn = fields separator >>= aux formColumn
where
aux Nothing [] = throwError EmptyHeader
aux Nothing (columnName:header) =
return $ AnnotationContext {columnIndex = 0, columnName, header}
aux (Just columnName) headerRecord =
case break (== columnName) headerRecord of
(_, []) -> throwError . NoSuchColumn $ Text.unpack columnName
(before, _:after) ->
let columnIndex = length before in
let header = (before ++ after) in
return $ AnnotationContext {columnIndex, columnName, header}
{-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Source.Common (
AnnotationContext(..)
, Field
, Row
, eol
, recordLine
) where
import Control.Applicative ((<|>))
import Data.Text.Lazy as Text (Text)
import Text.Parsec (
Column, Line, ParsecT, Stream, char, endOfLine, getParserState, sourceLine, statePos
, try
)
data AnnotationContext = AnnotationContext {
columnIndex :: Column
, columnName :: Text
, header :: Header
}
type Field = Text
type Header = [Text]
type Row = (Line, [Field])
eol :: Stream s m Char => ParsecT s u m ()
eol = (try endOfLine <|> char '\r') *> return ()
recordLine :: Monad m => ParsecT s u m a -> ParsecT s u m (Line, a)
recordLine p = (,) <$> currentLine <*> p
where
currentLine = sourceLine . statePos <$> getParserState
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
module Text.TEIWA.Source.ConLLX (
getContext
, sentences
) where
import Control.Applicative (many)
import Control.Monad.Except (MonadError(..))
import Data.Text.Lazy as Text (Text, pack, unpack)
import Text.Parsec (ParsecT, Stream, char, many1, noneOf, sepBy1)
import Text.TEIWA.Error (Error(..))
import Text.TEIWA.Source.Common (
AnnotationContext(..), Field, Row, eol, recordLine
)
fields :: [Field]
fields = [
"ID"
, "FORM"
, "LEMMA"
, "CPOSTAG"
, "POSTAG"
, "FEATS"
, "HEAD"
, "DEPREL"
, "PHEAD"
, "PDEPREL"
]
field :: Stream s m Char => ParsecT s u m Field
field = Text.pack <$> many1 (noneOf "\t\n\r")
row :: Stream s m Char => ParsecT s u m Row
row = recordLine (field `sepBy1` char '\t') <* eol
type Sentence = [Row]
sentence :: Stream s m Char => ParsecT s u m Sentence
sentence = many comment *> many1 row
where
comment = char '#' *> many (noneOf "\r\n") <* eol
sentences :: Stream s m Char => ParsecT s u m [Sentence]
sentences = many (sentence <* many1 eol)
getContext :: MonadError Error m => Maybe Text -> m AnnotationContext
getContext Nothing = return $ AnnotationContext {
columnIndex = 1
, columnName = "FORM"
, header = take 1 fields ++ drop 2 fields
}
getContext (Just columnName) =
case break (== columnName) fields of
(_, []) -> throwError . NoSuchColumn $ Text.unpack columnName
(before, _:after) ->
let columnIndex = length before in
let header = (before ++ after) in
return $ AnnotationContext {columnIndex, columnName, header}
cabal-version: >=1.10
-- Initial package description 'teiwa.cabal' generated by 'cabal init'.
-- For further documentation, see http://haskell.org/cabal/users-guide/
name: teiwa
version: 0.1.0.0
synopsis: TEI Word Annotator
description:
The TEI word annotator applies morpho-syntactic annotations following the
Lightweight Linguistic Annotation proposed by the TEI consortium
(https://tei-c.org/release/doc/tei-p5-doc/en/html/AI.html#AILALW) back on the
text that was annotated to achieve «in-place» annotation
homepage: https://gitlab.liris.cnrs.fr/abrenon/teiwa
-- bug-reports:
license: BSD3
license-file: LICENSE
author: Alice BRENON
maintainer: alice.brenon@ens-lyon.fr
-- copyright:
category: Text
build-type: Simple
extra-source-files: CHANGELOG.md
library
exposed-modules: Text.TEIWA
other-modules: Text.TEIWA.Annotation
, Text.TEIWA.Config
, Text.TEIWA.Error
, Text.TEIWA.Source
, Text.TEIWA.Source.Common
, Text.TEIWA.Source.ConLLX
, Text.TEIWA.Source.CSV
build-depends: base >=4.12 && <4.15
, bytestring
, mtl
, parsec
, text
hs-source-dirs: lib
default-language: Haskell2010
ghc-options: -Wall
executable teiwa
main-is: Main.hs
other-modules: CLI
, Paths_teiwa
-- other-extensions:
build-depends: base >=4.12 && <4.13
, filepath
, mtl
, optparse-applicative
, parsec
, teiwa
, text
hs-source-dirs: app
default-language: Haskell2010
ghc-options: -Wall
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment