From a9d97de545626db5b2062aebb3f06fff912da38e Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Tue, 2 Jan 2024 17:52:09 +0100
Subject: [PATCH] Improve scripts for profile computation: searching from
 (serialized) indexed trees, and using ranges to compact occurrences found

---
 lib/haskell/Conllu/Tree.hs                    |  13 ++-
 lib/python/GEODE/signal.py                    |   3 -
 scripts/textometry/computeProfile.hs          |  98 +++++++++++++++++
 ...serialiseSyntax.hs => indexSyntaxTrees.hs} |   0
 scripts/textometry/profile.hs                 | 104 ------------------
 scripts/textometry/visualiseProfile.py        |  66 ++++-------
 6 files changed, 126 insertions(+), 158 deletions(-)
 create mode 100755 scripts/textometry/computeProfile.hs
 rename scripts/textometry/{serialiseSyntax.hs => indexSyntaxTrees.hs} (100%)
 delete mode 100755 scripts/textometry/profile.hs

diff --git a/lib/haskell/Conllu/Tree.hs b/lib/haskell/Conllu/Tree.hs
index 84ef895..559183d 100644
--- a/lib/haskell/Conllu/Tree.hs
+++ b/lib/haskell/Conllu/Tree.hs
@@ -14,7 +14,7 @@ import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XP
 import qualified Conllu.UposTagset as Conllu (POS)
 import qualified Conllu.DeprelTagset as Conllu (EP)
 import Data.Int (Int8)
-import Data.List (foldl', partition)
+import Data.List (partition)
 import Data.Map as Map (Map, empty, insert)
 import Data.Serialize (Serialize(..))
 import Data.Tree (Forest, Tree(..))
@@ -147,9 +147,10 @@ indexSentence offset = IndexedSentence offset . build 0
     pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw)
 
 indexDocument :: Conllu.Doc -> IndexedDocument
-indexDocument doc =
-  IndexedDocument {_total, _sentences = zipWith indexSentence offsets sentWords}
+indexDocument doc = addSentences (0, []) $ Conllu._words <$> doc
   where
-    sentWords = Conllu._words <$> doc
-    (offsets, _total) = foldl' next ([], 0) sentWords
-    next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s)
+    addSentences (_total, sentencesStack) [] =
+      IndexedDocument {_total, _sentences = reverse sentencesStack}
+    addSentences (partial, sentencesStack) (sentence:others) =
+      let indexed = indexSentence partial sentence in
+      addSentences (partial + length sentence, indexed:sentencesStack) others
diff --git a/lib/python/GEODE/signal.py b/lib/python/GEODE/signal.py
index ad3bb14..978fb28 100644
--- a/lib/python/GEODE/signal.py
+++ b/lib/python/GEODE/signal.py
@@ -3,9 +3,6 @@ import math
 def curry(f):
     return lambda x: (lambda *args: f(x, *args))
 
-def gate(n, size, offset=0):
-    return [1 if i == n else 0 for i in range(offset, offset+size)]
-
 @curry
 def orientedIntersection(l, sNew, sOld):
     left = max(sNew*l[0], sOld*l[1])
diff --git a/scripts/textometry/computeProfile.hs b/scripts/textometry/computeProfile.hs
new file mode 100755
index 0000000..51cfcbb
--- /dev/null
+++ b/scripts/textometry/computeProfile.hs
@@ -0,0 +1,98 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
+
+{-# LANGUAGE DeriveGeneric, ExplicitNamespaces #-}
+import Conllu.Tree
+  ( Feat(..), IndexedDocument(..), IndexedSentence(..), IndexedWord(..)
+  , positions )
+import Control.Applicative ((<**>))
+import Control.Monad.IO.Class (liftIO)
+import Control.Monad.Reader (ReaderT, asks, runReaderT)
+import Data.Csv (ToNamedRecord(..))
+import Data.ByteString as ByteString (readFile)
+import Data.List (sort)
+import Data.Map as Map (lookup)
+import Data.Serialize (decode)
+import Data.Tree (Tree(..))
+import GEODE.Metadata
+  ( type(@), ArticleRecord, DefaultFields(..), HasDefaultHeader(..), Record(..)
+  , WithDefaultHeader(..), glue, readNamedTsv, tsvLines )
+import GEODE.Metadata.TSV.Header (for, getHeader)
+import GHC.Generics (Generic)
+import Options.Applicative
+  ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc
+  , short, strOption )
+import System.FilePath ((</>))
+import System.Script (try, warn)
+
+data Config = Config
+  { inputRoot :: FilePath
+  , inputTsv :: FilePath }
+
+configParser :: Parser Config
+configParser = Config
+  <$> strOption root
+  <*> strOption inputTsvArg
+  where
+    root = short 'r' <> long "root" <> metavar "TREE_DIRECTORY"
+          <> help "path to the base directory containing the syntax annotations"
+    inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV"
+          <> help "TSV file containing the article records to process"
+
+getConfig :: IO Config
+getConfig = execParser
+  (info
+    (configParser <**> helper)
+    (fullDesc
+    <> progDesc "A textometric tool to draw discursive profiles"))
+
+data Occurrence = Occurrence
+  { totalSize :: Int
+  , position :: Int
+  , size :: Int } deriving Generic
+
+instance ToNamedRecord Occurrence
+instance HasDefaultHeader Occurrence where
+  defaultFields = DefaultFields ["position", "size", "totalSize"]
+
+type Measure = ArticleRecord @ Occurrence
+
+profile :: ArticleRecord -> ReaderT Config IO [WithDefaultHeader Measure]
+profile articleRecord = do
+  path <- asks ((</> relativePath articleRecord "tree") . inputRoot)
+  liftIO $ ByteString.readFile path >>= either skipAndWarn analyse . decode
+  where
+    skipAndWarn msg = [] <$ warn msg
+    analyse = pure . fmap (glue articleRecord) . findOccurrences
+
+findOccurrences :: IndexedDocument -> [Occurrence]
+findOccurrences (IndexedDocument {_total, _sentences}) =
+  groupOccurrences [] (_sentences >>= imperativeVerb)
+  --uncurry (Occurrence _total) <$> (_sentences >>= imperativeVerb)
+  where
+    groupOccurrences :: [(Int, Int)] -> [Int] -> [Occurrence]
+    groupOccurrences stack [] = uncurry (Occurrence _total) <$> reverse stack
+    groupOccurrences [] (p:ps) = groupOccurrences [(p, 1)] ps
+    groupOccurrences (tmp@(first, tmpLength):stack) (p:ps)
+      | p == first + tmpLength =
+        groupOccurrences ((first, tmpLength + 1):stack) ps
+      | otherwise = groupOccurrences ((p, 1):tmp:stack) ps
+
+imperativeVerb :: IndexedSentence -> [Int]
+imperativeVerb (IndexedSentence {_offset, _syntax}) =
+  sort (_syntax >>= findSpans False)
+  where
+    findSpans isImperative (Node {rootLabel, subForest}) =
+      case Map.lookup "Mood" $ _feats rootLabel of
+        Just moods -> let nowImperative = _values moods == ["Imp"] in
+          consIf rootLabel nowImperative (findSpans nowImperative =<< subForest)
+        Nothing ->
+          consIf rootLabel isImperative (findSpans isImperative =<< subForest)
+    consIf a b = if b then (positions _offset a ++) else id
+
+main :: IO ()
+main = getConfig >>= runReaderT chain
+  where
+    chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay
+    searchAndDisplay rows = do
+      liftIO $ tsvLines [getHeader (for :: Measure)]
+      mapM_ (\ar -> profile ar >>= liftIO . tsvLines) rows
diff --git a/scripts/textometry/serialiseSyntax.hs b/scripts/textometry/indexSyntaxTrees.hs
similarity index 100%
rename from scripts/textometry/serialiseSyntax.hs
rename to scripts/textometry/indexSyntaxTrees.hs
diff --git a/scripts/textometry/profile.hs b/scripts/textometry/profile.hs
deleted file mode 100755
index 92997ba..0000000
--- a/scripts/textometry/profile.hs
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
-
-{-# LANGUAGE DeriveGeneric, ExplicitNamespaces #-}
-import Conllu.Parse (parseConllu)
-import Conllu.Type (AW, CW(..), Doc, Feat(..), ID(..), Rel(..), Sent(..))
-import Control.Applicative ((<**>))
-import Control.Monad.IO.Class (liftIO)
-import Control.Monad.Reader (ReaderT, asks, runReaderT)
-import Data.ByteString.Lazy as ByteString (putStr)
-import Data.Csv (ToNamedRecord(..), encodeWith)
-import Data.List (find, foldl', partition, sort)
-import Data.Tree (Forest, Tree(..))
-import GEODE.Metadata
-  ( type(@), ArticleRecord, DefaultFields(..), HasDefaultHeader(..), Record(..)
-  , WithDefaultHeader(..), glue, readNamedTsv, toTsv, tsvLines )
-import GEODE.Metadata.TSV.Header (for, getHeader)
-import GHC.Generics (Generic)
-import Options.Applicative
-  ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc
-  , short, strOption )
-import System.FilePath ((</>))
-import System.Script (try, warn)
-
-data Config = Config
-  { inputRoot :: FilePath
-  , inputTsv :: FilePath }
-
-configParser :: Parser Config
-configParser = Config
-  <$> strOption root
-  <*> strOption inputTsvArg
-  where
-    root = short 'r' <> long "root" <> metavar "CONLLU_DIRECTORY"
-          <> help "path to the base directory containing the syntax annotations"
-    inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV"
-          <> help "TSV file containing the article records to process"
-
-getConfig :: IO Config
-getConfig = execParser
-  (info
-    (configParser <**> helper)
-    (fullDesc
-    <> progDesc "A textometric tool to draw discursive profiles"))
-
-data Position = Position
-  { totalSize :: Int
-  , position :: Int } deriving Generic
-
-instance ToNamedRecord Position
-instance HasDefaultHeader Position where
-  defaultFields = DefaultFields ["position", "totalSize"]
-
-type Measure = ArticleRecord @ Position
-
-profile :: ArticleRecord -> ReaderT Config IO [WithDefaultHeader Measure]
-profile articleRecord = do
-  path <- asks ((</> relativePath articleRecord "conllu") . inputRoot)
-  liftIO $ readFile path >>= either skipAndWarn analyse . parseConllu path
-  where
-    skipAndWarn msg = [] <$ warn msg
-    analyse = pure . fmap (glue articleRecord) . findOccurrences
-
-findOccurrences :: Doc -> [Position]
-findOccurrences doc =
-  Position total <$> (zip (reverse offsets) doc >>= imperativeVerb)
-  where
-    (offsets, total) = foldl' next ([], 0) $ _words <$> doc
-    next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s)
-
-getSID :: ID -> Maybe Int
-getSID (SID n) = Just n
-getSID _ = Nothing
-
-imperativeVerb :: (Int, Sent) -> [Int]
-imperativeVerb (offset, sentence) = sort (syntax sentence >>= findSpans False)
-  where
-    findSpans isImperative (Node {rootLabel, subForest}) =
-      case getMoods $ _feats rootLabel of
-        Just moods -> let nowImperative = moods == ["Imp"] in
-          consIf rootLabel nowImperative (findSpans nowImperative =<< subForest)
-        Nothing ->
-          consIf rootLabel isImperative (findSpans isImperative =<< subForest)
-    consIf a b = if b then (maybe id ((:) . (offset+)) $ getSID (_id a)) else id
-    getMoods = fmap _featValues . find (("Mood" ==) . _feat)
-
-type Syntax = Forest (CW AW)
-
-syntax :: Sent -> Syntax
-syntax = build 0 . _words
-  where
-    build n cws =
-      let (pointing, others) = partition (pointingTo n) cws in
-      recurseOn others <$> pointing
-    recurseOn cws cw = Node
-      { rootLabel = cw, subForest = maybe [] (`build` cws) $ getSID (_id cw) }
-    pointingTo n cw = maybe False (n ==) (getSID . _head =<< _rel cw)
-
-main :: IO ()
-main = getConfig >>= runReaderT chain
-  where
-    chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay
-    searchAndDisplay rows = do
-      liftIO . ByteString.putStr $ encodeWith toTsv [getHeader (for :: Measure)]
-      mapM_ (\ar -> profile ar >>= liftIO . tsvLines) rows
diff --git a/scripts/textometry/visualiseProfile.py b/scripts/textometry/visualiseProfile.py
index 84cbeaa..54c5ef5 100755
--- a/scripts/textometry/visualiseProfile.py
+++ b/scripts/textometry/visualiseProfile.py
@@ -2,48 +2,15 @@
 
 from GEODE import fromTSV, toKey, toTSV
 from GEODE.store import prepare
-from GEODE.signal import gate, resample
+from GEODE.signal import resample
 import matplotlib.pyplot as plot
 import seaborn
 from sys import argv
 
-def plotSingle(measure):
-    return gate(measure['position'], measure['totalSize'], offset=1)
-
-histData = [0.3741792369121561, 0.7607808340727595, 0.8889086069210292,
-            0.7491570541259982, 1.2057675244010648, 1.0869565217391306,
-            1.096095829636202, 0.7124223602484473, 0.7251109139307897,
-            0.9335403726708075, 1.2277728482697428, 0.9582963620230699,
-            0.8540372670807452, 1.0281277728482696, 1.4221827861579415,
-            0.9218278615794143, 0.46814551907719604, 0.7717834960070986,
-            1.083762200532387, 0.65226264418811, 0.5771073646850046,
-            0.8025732031943212, 0.5266193433895296, 0.8911268855368234,
-            0.6836734693877551, 0.9039041703637977, 0.8720496894409939,
-            0.7113575865128662, 0.8984028393966283, 0.8993788819875776,
-            1.0016858917480034, 0.5857142857142857, 0.7364685004436559,
-            0.8047914818101152, 0.7055900621118011, 0.9018633540372669,
-            0.944010647737356, 0.9955634427684119, 1.0425909494232473,
-            0.9046140195208519, 0.8504880212954751, 1.1251109139307898,
-            0.44631765749778174, 0.49893522626441883, 0.6860692102928126,
-            0.7024844720496894, 0.4693877551020407, 1.5570541259982251,
-            0.8903283052351374, 0.6923691215616682, 0.8062999112688553,
-            1.0178349600709848, 0.5559006211180125, 0.7621118012422359,
-            0.848447204968944, 0.5782608695652174, 0.8464063886424137,
-            0.5537710736468501, 0.7160603371783496, 0.7982253771073646,
-            0.8371783496007098, 0.9143744454303461, 1.0799467613132205,
-            0.9581188997338067, 0.8597160603371785, 0.864685004436557,
-            1.2598935226264418, 1.3385093167701863, 0.45891748003549254,
-            0.9355811889973382, 0.6289263531499556, 0.7637089618456078,
-            0.7324755989352264, 0.754924578527063, 0.568589174800355,
-            0.49778172138420584, 0.7707187222715175, 1.0097604259094939,
-            0.8621118012422362, 0.8971606033717835, 1.1584738243123336,
-            1.1568766637089618, 0.7698314108251997, 0.9032830523513753,
-            0.5743566992014197, 0.8896184560780832, 0.7858917480035492,
-            0.9899733806566103, 1.617657497781721, 1.066725820763088,
-            0.6067435669920143, 1.1874001774622889, 1.0669920141969833,
-            1.1996450754214731, 1.4835847382431233, 1.6580301685891752,
-            2.2103815439219168, 2.4215616681455185, 2.7979591836734694,
-            5.278970718722271]
+def gate(measure):
+    first, last = measure['position'], measure['position'] + measure['size']
+    return [1 if i >= first and i < last else 0
+            for i in range(1, 1 + measure['totalSize'])]
 
 def plotProfile(profile, outputPath):
     plot.figure(figsize=(16,13))
@@ -59,17 +26,26 @@ def plotProfile(profile, outputPath):
     ax.fill_between(x, y, alpha=0.3)
     plot.savefig(prepare(outputPath), dpi=300, bbox_inches='tight')
 
+def sumProfiles(sameSizeProfiles):
+    return list(map(sum, zip(*sameSizeProfiles)))
+
 def computeProfile(measures, resolution=100):
-    profile = [0]*resolution
-    for resampled in map(resample(resolution), map(plotSingle, measures)):
-        profile = list(map(sum, zip(profile, resampled)))
-    return [100*x/len(measures) for x in profile]
+    bySize, count = {}, 0
+    for measure in measures:
+        distribution = gate(measure)
+        count += measure['size']
+        l = len(distribution)
+        if l not in bySize:
+            bySize[l] = []
+        bySize[l].append(distribution)
+    resampled = map(resample(resolution), map(sumProfiles, bySize.values()))
+    return [100*x/count for x in sumProfiles(list(resampled))]
 
-def drawProfile(measures, outputPath):
+def visualiseProfile(measures, outputPath):
     profile = computeProfile(measures)
     toTSV(prepare(f"{outputPath}/profile.tsv"), profile, sortBy=None)
     plotProfile(profile, f"{outputPath}/profile.png")
 
 if __name__ == '__main__':
-    drawProfile([measure for _, measure in fromTSV(argv[1]).iterrows()], argv[2])
-    #plotProfile(histData, argv[2])
+    visualiseProfile([measure for _, measure in fromTSV(argv[1]).iterrows()],
+                     argv[2])
-- 
GitLab