#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" {-# LANGUAGE ExplicitNamespaces #-} import qualified Conllu.UposTagset as Conllu (POS(NOUN)) import qualified Conllu.DeprelTagset as Conllu (EP(APPOS)) import Conllu.Tree ( EP(..), IndexedDocument(..), IndexedSentence(..), IndexedWord(..), POS(..) , Rel(..) ) import Control.Applicative ((<**>), (<|>)) import Control.Monad.IO.Class (liftIO) import Control.Monad.Reader (ReaderT, asks, runReaderT) import Data.ByteString as ByteString (readFile) import Data.Maybe (listToMaybe) import Data.Serialize (decode) import Data.Tree (Tree(..)) import GEODE.Metadata ( type(@), ArticleRecord, Record(..), WithDefaultHeader(..), glue , readNamedTsv, tsvLines ) import GEODE.Metadata.TSV.Header (for, getHeader) import Options.Applicative ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc , short, strOption ) import System.FilePath ((</>)) import System.Script (try, warn) data Config = Config { inputRoot :: FilePath , inputTsv :: FilePath } configParser :: Parser Config configParser = Config <$> strOption root <*> strOption inputTsvArg where root = short 'r' <> long "root" <> metavar "TREE_DIRECTORY" <> help "path to the base directory containing the syntax annotations" inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV" <> help "TSV file containing the article records to process" getConfig :: IO Config getConfig = execParser (info (configParser <**> helper) (fullDesc <> progDesc "A textometric tool to extract nouns at the top of the first sentence")) type Result = ArticleRecord @ IndexedWord profile :: ArticleRecord -> ReaderT Config IO (Maybe (WithDefaultHeader Result)) profile articleRecord = do path <- asks ((</> relativePath articleRecord "tree") . inputRoot) liftIO $ ByteString.readFile path >>= either skipAndWarn analyse . decode where skipAndWarn msg = Nothing <$ warn msg analyse = pure . fmap (glue articleRecord) . searchDocument searchDocument :: IndexedDocument -> Maybe IndexedWord searchDocument (IndexedDocument {_sentences}) = listToMaybe _sentences >>= firstTopNOUN firstTopNOUN :: IndexedSentence -> Maybe IndexedWord firstTopNOUN (IndexedSentence {_syntax}) = listToMaybe _syntax >>= fromTop where fromTop (Node {rootLabel, subForest}) | isNoun (_upos rootLabel) = Just rootLabel | otherwise = foldl (<|>) Nothing (apposNoun <$> subForest) apposNoun (Node {rootLabel}) | isNoun (_upos rootLabel) && isAppos (_deprel <$> _rel rootLabel) = Just rootLabel | otherwise = Nothing isNoun (Just (POS Conllu.NOUN)) = True isNoun _ = False isAppos (Just (EP Conllu.APPOS)) = True isAppos _ = False main :: IO () main = getConfig >>= runReaderT chain where chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay searchAndDisplay rows = do liftIO $ tsvLines [getHeader (for :: Result)] mapM_ (\ar -> profile ar >>= liftIO . tsvLines . maybe [] (:[])) rows