From a9d97de545626db5b2062aebb3f06fff912da38e Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Tue, 2 Jan 2024 17:52:09 +0100 Subject: [PATCH] Improve scripts for profile computation: searching from (serialized) indexed trees, and using ranges to compact occurrences found --- lib/haskell/Conllu/Tree.hs | 13 ++- lib/python/GEODE/signal.py | 3 - scripts/textometry/computeProfile.hs | 98 +++++++++++++++++ ...serialiseSyntax.hs => indexSyntaxTrees.hs} | 0 scripts/textometry/profile.hs | 104 ------------------ scripts/textometry/visualiseProfile.py | 66 ++++------- 6 files changed, 126 insertions(+), 158 deletions(-) create mode 100755 scripts/textometry/computeProfile.hs rename scripts/textometry/{serialiseSyntax.hs => indexSyntaxTrees.hs} (100%) delete mode 100755 scripts/textometry/profile.hs diff --git a/lib/haskell/Conllu/Tree.hs b/lib/haskell/Conllu/Tree.hs index 84ef895..559183d 100644 --- a/lib/haskell/Conllu/Tree.hs +++ b/lib/haskell/Conllu/Tree.hs @@ -14,7 +14,7 @@ import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XP import qualified Conllu.UposTagset as Conllu (POS) import qualified Conllu.DeprelTagset as Conllu (EP) import Data.Int (Int8) -import Data.List (foldl', partition) +import Data.List (partition) import Data.Map as Map (Map, empty, insert) import Data.Serialize (Serialize(..)) import Data.Tree (Forest, Tree(..)) @@ -147,9 +147,10 @@ indexSentence offset = IndexedSentence offset . build 0 pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw) indexDocument :: Conllu.Doc -> IndexedDocument -indexDocument doc = - IndexedDocument {_total, _sentences = zipWith indexSentence offsets sentWords} +indexDocument doc = addSentences (0, []) $ Conllu._words <$> doc where - sentWords = Conllu._words <$> doc - (offsets, _total) = foldl' next ([], 0) sentWords - next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s) + addSentences (_total, sentencesStack) [] = + IndexedDocument {_total, _sentences = reverse sentencesStack} + addSentences (partial, sentencesStack) (sentence:others) = + let indexed = indexSentence partial sentence in + addSentences (partial + length sentence, indexed:sentencesStack) others diff --git a/lib/python/GEODE/signal.py b/lib/python/GEODE/signal.py index ad3bb14..978fb28 100644 --- a/lib/python/GEODE/signal.py +++ b/lib/python/GEODE/signal.py @@ -3,9 +3,6 @@ import math def curry(f): return lambda x: (lambda *args: f(x, *args)) -def gate(n, size, offset=0): - return [1 if i == n else 0 for i in range(offset, offset+size)] - @curry def orientedIntersection(l, sNew, sOld): left = max(sNew*l[0], sOld*l[1]) diff --git a/scripts/textometry/computeProfile.hs b/scripts/textometry/computeProfile.hs new file mode 100755 index 0000000..51cfcbb --- /dev/null +++ b/scripts/textometry/computeProfile.hs @@ -0,0 +1,98 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" + +{-# LANGUAGE DeriveGeneric, ExplicitNamespaces #-} +import Conllu.Tree + ( Feat(..), IndexedDocument(..), IndexedSentence(..), IndexedWord(..) + , positions ) +import Control.Applicative ((<**>)) +import Control.Monad.IO.Class (liftIO) +import Control.Monad.Reader (ReaderT, asks, runReaderT) +import Data.Csv (ToNamedRecord(..)) +import Data.ByteString as ByteString (readFile) +import Data.List (sort) +import Data.Map as Map (lookup) +import Data.Serialize (decode) +import Data.Tree (Tree(..)) +import GEODE.Metadata + ( type(@), ArticleRecord, DefaultFields(..), HasDefaultHeader(..), Record(..) + , WithDefaultHeader(..), glue, readNamedTsv, tsvLines ) +import GEODE.Metadata.TSV.Header (for, getHeader) +import GHC.Generics (Generic) +import Options.Applicative + ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc + , short, strOption ) +import System.FilePath ((</>)) +import System.Script (try, warn) + +data Config = Config + { inputRoot :: FilePath + , inputTsv :: FilePath } + +configParser :: Parser Config +configParser = Config + <$> strOption root + <*> strOption inputTsvArg + where + root = short 'r' <> long "root" <> metavar "TREE_DIRECTORY" + <> help "path to the base directory containing the syntax annotations" + inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV" + <> help "TSV file containing the article records to process" + +getConfig :: IO Config +getConfig = execParser + (info + (configParser <**> helper) + (fullDesc + <> progDesc "A textometric tool to draw discursive profiles")) + +data Occurrence = Occurrence + { totalSize :: Int + , position :: Int + , size :: Int } deriving Generic + +instance ToNamedRecord Occurrence +instance HasDefaultHeader Occurrence where + defaultFields = DefaultFields ["position", "size", "totalSize"] + +type Measure = ArticleRecord @ Occurrence + +profile :: ArticleRecord -> ReaderT Config IO [WithDefaultHeader Measure] +profile articleRecord = do + path <- asks ((</> relativePath articleRecord "tree") . inputRoot) + liftIO $ ByteString.readFile path >>= either skipAndWarn analyse . decode + where + skipAndWarn msg = [] <$ warn msg + analyse = pure . fmap (glue articleRecord) . findOccurrences + +findOccurrences :: IndexedDocument -> [Occurrence] +findOccurrences (IndexedDocument {_total, _sentences}) = + groupOccurrences [] (_sentences >>= imperativeVerb) + --uncurry (Occurrence _total) <$> (_sentences >>= imperativeVerb) + where + groupOccurrences :: [(Int, Int)] -> [Int] -> [Occurrence] + groupOccurrences stack [] = uncurry (Occurrence _total) <$> reverse stack + groupOccurrences [] (p:ps) = groupOccurrences [(p, 1)] ps + groupOccurrences (tmp@(first, tmpLength):stack) (p:ps) + | p == first + tmpLength = + groupOccurrences ((first, tmpLength + 1):stack) ps + | otherwise = groupOccurrences ((p, 1):tmp:stack) ps + +imperativeVerb :: IndexedSentence -> [Int] +imperativeVerb (IndexedSentence {_offset, _syntax}) = + sort (_syntax >>= findSpans False) + where + findSpans isImperative (Node {rootLabel, subForest}) = + case Map.lookup "Mood" $ _feats rootLabel of + Just moods -> let nowImperative = _values moods == ["Imp"] in + consIf rootLabel nowImperative (findSpans nowImperative =<< subForest) + Nothing -> + consIf rootLabel isImperative (findSpans isImperative =<< subForest) + consIf a b = if b then (positions _offset a ++) else id + +main :: IO () +main = getConfig >>= runReaderT chain + where + chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay + searchAndDisplay rows = do + liftIO $ tsvLines [getHeader (for :: Measure)] + mapM_ (\ar -> profile ar >>= liftIO . tsvLines) rows diff --git a/scripts/textometry/serialiseSyntax.hs b/scripts/textometry/indexSyntaxTrees.hs similarity index 100% rename from scripts/textometry/serialiseSyntax.hs rename to scripts/textometry/indexSyntaxTrees.hs diff --git a/scripts/textometry/profile.hs b/scripts/textometry/profile.hs deleted file mode 100755 index 92997ba..0000000 --- a/scripts/textometry/profile.hs +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" - -{-# LANGUAGE DeriveGeneric, ExplicitNamespaces #-} -import Conllu.Parse (parseConllu) -import Conllu.Type (AW, CW(..), Doc, Feat(..), ID(..), Rel(..), Sent(..)) -import Control.Applicative ((<**>)) -import Control.Monad.IO.Class (liftIO) -import Control.Monad.Reader (ReaderT, asks, runReaderT) -import Data.ByteString.Lazy as ByteString (putStr) -import Data.Csv (ToNamedRecord(..), encodeWith) -import Data.List (find, foldl', partition, sort) -import Data.Tree (Forest, Tree(..)) -import GEODE.Metadata - ( type(@), ArticleRecord, DefaultFields(..), HasDefaultHeader(..), Record(..) - , WithDefaultHeader(..), glue, readNamedTsv, toTsv, tsvLines ) -import GEODE.Metadata.TSV.Header (for, getHeader) -import GHC.Generics (Generic) -import Options.Applicative - ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc - , short, strOption ) -import System.FilePath ((</>)) -import System.Script (try, warn) - -data Config = Config - { inputRoot :: FilePath - , inputTsv :: FilePath } - -configParser :: Parser Config -configParser = Config - <$> strOption root - <*> strOption inputTsvArg - where - root = short 'r' <> long "root" <> metavar "CONLLU_DIRECTORY" - <> help "path to the base directory containing the syntax annotations" - inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV" - <> help "TSV file containing the article records to process" - -getConfig :: IO Config -getConfig = execParser - (info - (configParser <**> helper) - (fullDesc - <> progDesc "A textometric tool to draw discursive profiles")) - -data Position = Position - { totalSize :: Int - , position :: Int } deriving Generic - -instance ToNamedRecord Position -instance HasDefaultHeader Position where - defaultFields = DefaultFields ["position", "totalSize"] - -type Measure = ArticleRecord @ Position - -profile :: ArticleRecord -> ReaderT Config IO [WithDefaultHeader Measure] -profile articleRecord = do - path <- asks ((</> relativePath articleRecord "conllu") . inputRoot) - liftIO $ readFile path >>= either skipAndWarn analyse . parseConllu path - where - skipAndWarn msg = [] <$ warn msg - analyse = pure . fmap (glue articleRecord) . findOccurrences - -findOccurrences :: Doc -> [Position] -findOccurrences doc = - Position total <$> (zip (reverse offsets) doc >>= imperativeVerb) - where - (offsets, total) = foldl' next ([], 0) $ _words <$> doc - next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s) - -getSID :: ID -> Maybe Int -getSID (SID n) = Just n -getSID _ = Nothing - -imperativeVerb :: (Int, Sent) -> [Int] -imperativeVerb (offset, sentence) = sort (syntax sentence >>= findSpans False) - where - findSpans isImperative (Node {rootLabel, subForest}) = - case getMoods $ _feats rootLabel of - Just moods -> let nowImperative = moods == ["Imp"] in - consIf rootLabel nowImperative (findSpans nowImperative =<< subForest) - Nothing -> - consIf rootLabel isImperative (findSpans isImperative =<< subForest) - consIf a b = if b then (maybe id ((:) . (offset+)) $ getSID (_id a)) else id - getMoods = fmap _featValues . find (("Mood" ==) . _feat) - -type Syntax = Forest (CW AW) - -syntax :: Sent -> Syntax -syntax = build 0 . _words - where - build n cws = - let (pointing, others) = partition (pointingTo n) cws in - recurseOn others <$> pointing - recurseOn cws cw = Node - { rootLabel = cw, subForest = maybe [] (`build` cws) $ getSID (_id cw) } - pointingTo n cw = maybe False (n ==) (getSID . _head =<< _rel cw) - -main :: IO () -main = getConfig >>= runReaderT chain - where - chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay - searchAndDisplay rows = do - liftIO . ByteString.putStr $ encodeWith toTsv [getHeader (for :: Measure)] - mapM_ (\ar -> profile ar >>= liftIO . tsvLines) rows diff --git a/scripts/textometry/visualiseProfile.py b/scripts/textometry/visualiseProfile.py index 84cbeaa..54c5ef5 100755 --- a/scripts/textometry/visualiseProfile.py +++ b/scripts/textometry/visualiseProfile.py @@ -2,48 +2,15 @@ from GEODE import fromTSV, toKey, toTSV from GEODE.store import prepare -from GEODE.signal import gate, resample +from GEODE.signal import resample import matplotlib.pyplot as plot import seaborn from sys import argv -def plotSingle(measure): - return gate(measure['position'], measure['totalSize'], offset=1) - -histData = [0.3741792369121561, 0.7607808340727595, 0.8889086069210292, - 0.7491570541259982, 1.2057675244010648, 1.0869565217391306, - 1.096095829636202, 0.7124223602484473, 0.7251109139307897, - 0.9335403726708075, 1.2277728482697428, 0.9582963620230699, - 0.8540372670807452, 1.0281277728482696, 1.4221827861579415, - 0.9218278615794143, 0.46814551907719604, 0.7717834960070986, - 1.083762200532387, 0.65226264418811, 0.5771073646850046, - 0.8025732031943212, 0.5266193433895296, 0.8911268855368234, - 0.6836734693877551, 0.9039041703637977, 0.8720496894409939, - 0.7113575865128662, 0.8984028393966283, 0.8993788819875776, - 1.0016858917480034, 0.5857142857142857, 0.7364685004436559, - 0.8047914818101152, 0.7055900621118011, 0.9018633540372669, - 0.944010647737356, 0.9955634427684119, 1.0425909494232473, - 0.9046140195208519, 0.8504880212954751, 1.1251109139307898, - 0.44631765749778174, 0.49893522626441883, 0.6860692102928126, - 0.7024844720496894, 0.4693877551020407, 1.5570541259982251, - 0.8903283052351374, 0.6923691215616682, 0.8062999112688553, - 1.0178349600709848, 0.5559006211180125, 0.7621118012422359, - 0.848447204968944, 0.5782608695652174, 0.8464063886424137, - 0.5537710736468501, 0.7160603371783496, 0.7982253771073646, - 0.8371783496007098, 0.9143744454303461, 1.0799467613132205, - 0.9581188997338067, 0.8597160603371785, 0.864685004436557, - 1.2598935226264418, 1.3385093167701863, 0.45891748003549254, - 0.9355811889973382, 0.6289263531499556, 0.7637089618456078, - 0.7324755989352264, 0.754924578527063, 0.568589174800355, - 0.49778172138420584, 0.7707187222715175, 1.0097604259094939, - 0.8621118012422362, 0.8971606033717835, 1.1584738243123336, - 1.1568766637089618, 0.7698314108251997, 0.9032830523513753, - 0.5743566992014197, 0.8896184560780832, 0.7858917480035492, - 0.9899733806566103, 1.617657497781721, 1.066725820763088, - 0.6067435669920143, 1.1874001774622889, 1.0669920141969833, - 1.1996450754214731, 1.4835847382431233, 1.6580301685891752, - 2.2103815439219168, 2.4215616681455185, 2.7979591836734694, - 5.278970718722271] +def gate(measure): + first, last = measure['position'], measure['position'] + measure['size'] + return [1 if i >= first and i < last else 0 + for i in range(1, 1 + measure['totalSize'])] def plotProfile(profile, outputPath): plot.figure(figsize=(16,13)) @@ -59,17 +26,26 @@ def plotProfile(profile, outputPath): ax.fill_between(x, y, alpha=0.3) plot.savefig(prepare(outputPath), dpi=300, bbox_inches='tight') +def sumProfiles(sameSizeProfiles): + return list(map(sum, zip(*sameSizeProfiles))) + def computeProfile(measures, resolution=100): - profile = [0]*resolution - for resampled in map(resample(resolution), map(plotSingle, measures)): - profile = list(map(sum, zip(profile, resampled))) - return [100*x/len(measures) for x in profile] + bySize, count = {}, 0 + for measure in measures: + distribution = gate(measure) + count += measure['size'] + l = len(distribution) + if l not in bySize: + bySize[l] = [] + bySize[l].append(distribution) + resampled = map(resample(resolution), map(sumProfiles, bySize.values())) + return [100*x/count for x in sumProfiles(list(resampled))] -def drawProfile(measures, outputPath): +def visualiseProfile(measures, outputPath): profile = computeProfile(measures) toTSV(prepare(f"{outputPath}/profile.tsv"), profile, sortBy=None) plotProfile(profile, f"{outputPath}/profile.png") if __name__ == '__main__': - drawProfile([measure for _, measure in fromTSV(argv[1]).iterrows()], argv[2]) - #plotProfile(histData, argv[2]) + visualiseProfile([measure for _, measure in fromTSV(argv[1]).iterrows()], + argv[2]) -- GitLab