{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} module Text.TEIWA.Source ( Format , Origin(..) , Source(..) , coNLLX , csv , runTEIWAParser , parse , tsv ) where import Control.Monad.Except (MonadError(..)) import Control.Monad.Reader (MonadReader(..), ReaderT(..)) import Control.Monad.IO.Class (MonadIO(..)) import Data.Text.Lazy as Text (Text, toLower, unpack) import Data.Text.Lazy.IO as Text (readFile) import Text.Parsec (SourceName, runParserT) import Text.TEIWA.Annotation ( Annotation(..), Attributes, SentenceAnnotation(..), TokenAnnotation(..) ) import Text.TEIWA.Config (Config(..)) import Text.TEIWA.Error (Error(..)) import Text.TEIWA.Source.Common ( AnnotationContext(..), Field, Row, TEIWAParser, annotationContext ) import qualified Text.TEIWA.Source.CoNLLX as CoNLLX (header, sentences) import qualified Text.TEIWA.Source.SSV as SSV (body, header) type Format = TEIWAParser Annotation isTEI :: Text -> Bool isTEI = (`elem` ["pos", "lemma", "msd"]) filterAttributes :: MonadReader Config m => [(Text, Field)] -> m Attributes filterAttributes attributes = reader $ (attributes >>=) . filterIf . strictTEI where filterIf _ ("", _) = [] filterIf _ (_, Nothing) = [] filterIf False (k, Just v) = [(k, v)] filterIf True (k, Just v) = let k' = Text.toLower k in if isTEI k' then [(k', v)] else [] annotateToken :: (MonadError Error m, MonadReader Config m) => AnnotationContext -> Row -> m TokenAnnotation annotateToken (AnnotationContext {columnIndex, columnName, header}) (atLine, record) = case splitAt columnIndex record of (before, (Just form):after) -> TokenAnnotation form <$> filterAttributes (zip header $ before ++ after) _ -> throwError . MissingColumn atLine $ Text.unpack columnName coNLLX :: Format coNLLX = do context <- annotationContext CoNLLX.header SentenceLevel <$> ( CoNLLX.sentences >>= mapM ( fmap SentenceAnnotation . mapM (annotateToken context) ) ) ssv :: Char -> Format ssv separator = do context <- annotationContext =<< SSV.header separator TokenLevel <$> (SSV.body separator >>= mapM (annotateToken context)) csv :: Format csv = ssv ',' tsv :: Format tsv = ssv '\t' data Origin = File FilePath | Text Text data Source = Source { format :: Format , origin :: Origin } runTEIWAParser :: MonadError Error m => Config -> TEIWAParser a -> SourceName -> Text -> m a runTEIWAParser config p s = flattenErrors . (`runReaderT` config) . runParserT p () s where flattenErrors = either throwError (either (throwError . ParsingError) pure) parse :: (MonadIO m, MonadError Error m) => Config -> Source -> m Annotation parse config (Source {format, origin}) = parseFrom format origin where parseFrom p (File f) = liftIO (Text.readFile f) >>= runTEIWAParser config p f parseFrom p (Text t) = runTEIWAParser config p "" t