#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" {-# LANGUAGE DeriveGeneric, ExplicitNamespaces #-} import Conllu.Parse (parseConllu) import Conllu.Type (AW, CW(..), Doc, Feat(..), ID(..), Rel(..), Sent(..)) import Control.Applicative ((<**>)) import Control.Monad.IO.Class (liftIO) import Control.Monad.Reader (ReaderT, asks, runReaderT) import Data.ByteString.Lazy as ByteString (putStr) import Data.Csv (ToNamedRecord(..), encodeWith) import Data.List (find, foldl', partition, sort) import Data.Tree (Forest, Tree(..)) import GEODE.Metadata ( type(@), ArticleRecord, DefaultFields(..), HasDefaultHeader(..), Record(..) , WithDefaultHeader(..), glue, readNamedTsv, toTsv, tsvLines ) import GEODE.Metadata.TSV.Header (for, getHeader) import GHC.Generics (Generic) import Options.Applicative ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc , short, strOption ) import System.FilePath ((</>)) import System.Script (try, warn) data Config = Config { inputRoot :: FilePath , inputTsv :: FilePath } configParser :: Parser Config configParser = Config <$> strOption root <*> strOption inputTsvArg where root = short 'r' <> long "root" <> metavar "CONLLU_DIRECTORY" <> help "path to the base directory containing the syntax annotations" inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV" <> help "TSV file containing the article records to process" getConfig :: IO Config getConfig = execParser (info (configParser <**> helper) (fullDesc <> progDesc "A textometric tool to draw discursive profiles")) data Position = Position { totalSize :: Int , position :: Int } deriving Generic instance ToNamedRecord Position instance HasDefaultHeader Position where defaultFields = DefaultFields ["position", "totalSize"] type Measure = ArticleRecord @ Position profile :: ArticleRecord -> ReaderT Config IO [WithDefaultHeader Measure] profile articleRecord = do path <- asks ((</> relativePath articleRecord "conllu") . inputRoot) liftIO $ readFile path >>= either skipAndWarn analyse . parseConllu path where skipAndWarn msg = [] <$ warn msg analyse = pure . fmap (glue articleRecord) . findOccurrences findOccurrences :: Doc -> [Position] findOccurrences doc = Position total <$> (zip (reverse offsets) doc >>= imperativeVerb) where (offsets, total) = foldl' next ([], 0) $ _words <$> doc next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s) getSID :: ID -> Maybe Int getSID (SID n) = Just n getSID _ = Nothing imperativeVerb :: (Int, Sent) -> [Int] imperativeVerb (offset, sentence) = sort (syntax sentence >>= findSpans False) where findSpans isImperative (Node {rootLabel, subForest}) = case getMoods $ _feats rootLabel of Just moods -> let nowImperative = moods == ["Imp"] in consIf rootLabel nowImperative (findSpans nowImperative =<< subForest) Nothing -> consIf rootLabel isImperative (findSpans isImperative =<< subForest) consIf a b = if b then (maybe id ((:) . (offset+)) $ getSID (_id a)) else id getMoods = fmap _featValues . find (("Mood" ==) . _feat) type Syntax = Forest (CW AW) syntax :: Sent -> Syntax syntax = build 0 . _words where build n cws = let (pointing, others) = partition (pointingTo n) cws in recurseOn others <$> pointing recurseOn cws cw = Node { rootLabel = cw, subForest = maybe [] (`build` cws) $ getSID (_id cw) } pointingTo n cw = maybe False (n ==) (getSID . _head =<< _rel cw) main :: IO () main = getConfig >>= runReaderT chain where chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay searchAndDisplay rows = do liftIO . ByteString.putStr $ encodeWith toTsv [getHeader (for :: Measure)] mapM_ (\ar -> profile ar >>= liftIO . tsvLines) rows