From 5806cf119c8b94fa4806dc8598238150368f154a Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Fri, 29 Dec 2023 12:23:37 +0100 Subject: [PATCH] Add a script to extract the repartition of verbs with the imperative mood within a corpus --- scripts/textometry/profile.hs | 104 ++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100755 scripts/textometry/profile.hs diff --git a/scripts/textometry/profile.hs b/scripts/textometry/profile.hs new file mode 100755 index 0000000..f9fc6bd --- /dev/null +++ b/scripts/textometry/profile.hs @@ -0,0 +1,104 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" + +{-# LANGUAGE DeriveGeneric, ExplicitNamespaces #-} +import Conllu.Parse (parseConllu) +import Conllu.Type (AW, CW(..), Doc, Feat(..), ID(..), Rel(..), Sent(..)) +import Control.Applicative ((<**>)) +import Data.ByteString.Lazy as ByteString (putStr) +import Data.Csv (ToNamedRecord(..), encodeWith) +import Data.List (find, foldl', partition, sort) +import Data.Tree (Forest, Tree(..)) +import GEODE.Metadata + ( type(@), ArticleRecord, DefaultFields(..), HasDefaultHeader(..), Record(..) + , WithDefaultHeader(..), glue, readNamedTsv, toTsv, tsvLines ) +import GEODE.Metadata.TSV.Header (for, getHeader) +import GHC.Generics (Generic) +import Options.Applicative + ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc + , short, strOption ) +import System.FilePath ((</>)) +import System.Script (try, warn) + +data Config = Config + { inputRoot :: FilePath + , inputTsv :: FilePath } + +configParser :: Parser Config +configParser = Config + <$> strOption root + <*> strOption inputTsvArg + where + root = short 'r' <> long "root" <> metavar "CONLLU_DIRECTORY" + <> help "path to the base directory containing the syntax annotations" + inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV" + <> help "TSV file containing the article records to process" + +getConfig :: IO Config +getConfig = execParser + (info + (configParser <**> helper) + (fullDesc + <> progDesc "A textometric tool to draw discursive profiles")) + +data Position = Position + { totalSize :: Int + , position :: Int } deriving Generic + +instance ToNamedRecord Position +instance HasDefaultHeader Position where + defaultFields = DefaultFields ["position", "totalSize"] + +type Measure = ArticleRecord @ Position + +profile :: Config -> ArticleRecord -> IO [WithDefaultHeader Measure] +profile (Config {inputRoot}) articleRecord = + readFile inputPath >>= either skipAndWarn analyse . parseConllu inputPath + where + inputPath = inputRoot </> relativePath articleRecord "conllu" + skipAndWarn msg = [] <$ warn msg + analyse = pure . fmap (glue articleRecord) . findOccurrences + +findOccurrences :: Doc -> [Position] +findOccurrences doc = + Position total <$> (zip (reverse offsets) doc >>= imperativeVerb) + where + (offsets, total) = foldl' next ([], 0) $ _words <$> doc + next (tmpOffsets, tmpTotal) sentence = (tmpTotal:tmpOffsets, tmpTotal + length sentence) + +getSID :: ID -> Maybe Int +getSID (SID n) = Just n +getSID _ = Nothing + +imperativeVerb :: (Int, Sent) -> [Int] +imperativeVerb (offset, sentence) = sort (syntax sentence >>= findSpans False) + where + findSpans isImperative (Node {rootLabel, subForest}) = + case getMoods $ _feats rootLabel of + Just moods -> let nowImperative = moods == ["Imp"] in + consIf rootLabel nowImperative (findSpans nowImperative =<< subForest) + Nothing -> + consIf rootLabel isImperative (findSpans isImperative =<< subForest) + consIf a b = if b then (maybe id ((:) . (offset+)) $ getSID (_id a)) else id + getMoods = fmap _featValues . find (("Mood" ==) . _feat) + +type Syntax = Forest (CW AW) + +syntax :: Sent -> Syntax +syntax = build 0 . _words + where + build n cws = + let (pointing, others) = partition (pointingTo n) cws in + recurseOn others <$> pointing + recurseOn cws cw = Node + { rootLabel = cw, subForest = maybe [] (`build` cws) $ getSID (_id cw) } + pointingTo n cw = maybe False (n ==) (getSID . _head =<< _rel cw) + +main :: IO () +main = do + config@(Config {inputTsv}) <- getConfig + try (readNamedTsv inputTsv) + >>= searchAndDisplay config + where + searchAndDisplay config rows = do + ByteString.putStr $ encodeWith toTsv [getHeader (for :: Measure)] + mapM_ (\ar -> profile config ar >>= tsvLines) rows -- GitLab