Commit a9d97de5 authored by Alice Brenon

Improve scripts for profile computation: searching from (serialized) indexed trees, and using ranges to compact occurrences found
parent b861c1a2
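Editor's note: the second improvement named in the commit message, compacting the occurrences found into ranges, is implemented by groupOccurrences in the diff below. As a rough sketch of the idea (illustrative only, not the committed code), consecutive positions in a sorted list can be merged into (start, length) pairs with a single fold:

import Data.List (sort)

-- Illustrative sketch: merge consecutive positions into (start, length)
-- ranges, the same compaction groupOccurrences performs on occurrences.
compact :: [Int] -> [(Int, Int)]
compact = foldr merge [] . sort
  where
    merge p ((start, len):ranges)
      | p + 1 == start = (p, len + 1) : ranges
    merge p ranges = (p, 1) : ranges

main :: IO ()
main = print (compact [9, 3, 4, 5])  -- [(3,3),(9,1)]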
@@ -14,7 +14,7 @@ import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XP
 import qualified Conllu.UposTagset as Conllu (POS)
 import qualified Conllu.DeprelTagset as Conllu (EP)
 import Data.Int (Int8)
-import Data.List (foldl', partition)
+import Data.List (partition)
 import Data.Map as Map (Map, empty, insert)
 import Data.Serialize (Serialize(..))
 import Data.Tree (Forest, Tree(..))
@@ -147,9 +147,10 @@ indexSentence offset = IndexedSentence offset . build 0
     pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw)

 indexDocument :: Conllu.Doc -> IndexedDocument
-indexDocument doc =
-  IndexedDocument {_total, _sentences = zipWith indexSentence offsets sentWords}
+indexDocument doc = addSentences (0, []) $ Conllu._words <$> doc
   where
-    sentWords = Conllu._words <$> doc
-    (offsets, _total) = foldl' next ([], 0) sentWords
-    next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s)
+    addSentences (_total, sentencesStack) [] =
+      IndexedDocument {_total, _sentences = reverse sentencesStack}
+    addSentences (partial, sentencesStack) (sentence:others) =
+      let indexed = indexSentence partial sentence in
+        addSentences (partial + length sentence, indexed:sentencesStack) others
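Editor's note, for intuition about the bookkeeping above: addSentences threads the running token count through the document, so each sentence is indexed at the number of tokens preceding it. A minimal sketch of the same arithmetic (hypothetical helper, not part of the commit), where sentence lengths [3, 5, 2] yield offsets [0, 3, 8] and a total of 10:

-- Running token counts before each sentence, and the document total.
sentenceOffsets :: [[a]] -> ([Int], Int)
sentenceOffsets sentences = (init counts, last counts)
  where
    counts = scanl (+) 0 (map length sentences)

main :: IO ()
main = print (sentenceOffsets ["abc", "hello", "hi"])  -- ([0,3,8],10)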
@@ -3,9 +3,6 @@ import math
 def curry(f):
     return lambda x: (lambda *args: f(x, *args))

-def gate(n, size, offset=0):
-    return [1 if i == n else 0 for i in range(offset, offset+size)]
-
 @curry
 def orientedIntersection(l, sNew, sOld):
     left = max(sNew*l[0], sOld*l[1])
 #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
 {-# LANGUAGE DeriveGeneric, ExplicitNamespaces #-}
-import Conllu.Parse (parseConllu)
-import Conllu.Type (AW, CW(..), Doc, Feat(..), ID(..), Rel(..), Sent(..))
+import Conllu.Tree
+  ( Feat(..), IndexedDocument(..), IndexedSentence(..), IndexedWord(..)
+  , positions )
 import Control.Applicative ((<**>))
 import Control.Monad.IO.Class (liftIO)
 import Control.Monad.Reader (ReaderT, asks, runReaderT)
-import Data.ByteString.Lazy as ByteString (putStr)
-import Data.Csv (ToNamedRecord(..), encodeWith)
-import Data.List (find, foldl', partition, sort)
-import Data.Tree (Forest, Tree(..))
+import Data.Csv (ToNamedRecord(..))
+import Data.ByteString as ByteString (readFile)
+import Data.List (sort)
+import Data.Map as Map (lookup)
+import Data.Serialize (decode)
+import Data.Tree (Tree(..))
 import GEODE.Metadata
   ( type(@), ArticleRecord, DefaultFields(..), HasDefaultHeader(..), Record(..)
-  , WithDefaultHeader(..), glue, readNamedTsv, toTsv, tsvLines )
+  , WithDefaultHeader(..), glue, readNamedTsv, tsvLines )
 import GEODE.Metadata.TSV.Header (for, getHeader)
 import GHC.Generics (Generic)
 import Options.Applicative
@@ -30,7 +33,7 @@ configParser = Config
   <$> strOption root
   <*> strOption inputTsvArg
   where
-    root = short 'r' <> long "root" <> metavar "CONLLU_DIRECTORY"
+    root = short 'r' <> long "root" <> metavar "TREE_DIRECTORY"
       <> help "path to the base directory containing the syntax annotations"
     inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV"
       <> help "TSV file containing the article records to process"
@@ -42,63 +45,54 @@ getConfig = execParser
   (fullDesc
   <> progDesc "A textometric tool to draw discursive profiles"))

-data Position = Position
+data Occurrence = Occurrence
   { totalSize :: Int
-  , position :: Int } deriving Generic
+  , position :: Int
+  , size :: Int } deriving Generic

-instance ToNamedRecord Position
-instance HasDefaultHeader Position where
-  defaultFields = DefaultFields ["position", "totalSize"]
+instance ToNamedRecord Occurrence
+instance HasDefaultHeader Occurrence where
+  defaultFields = DefaultFields ["position", "size", "totalSize"]

-type Measure = ArticleRecord @ Position
+type Measure = ArticleRecord @ Occurrence

 profile :: ArticleRecord -> ReaderT Config IO [WithDefaultHeader Measure]
 profile articleRecord = do
-  path <- asks ((</> relativePath articleRecord "conllu") . inputRoot)
-  liftIO $ readFile path >>= either skipAndWarn analyse . parseConllu path
+  path <- asks ((</> relativePath articleRecord "tree") . inputRoot)
+  liftIO $ ByteString.readFile path >>= either skipAndWarn analyse . decode
   where
     skipAndWarn msg = [] <$ warn msg
     analyse = pure . fmap (glue articleRecord) . findOccurrences

-findOccurrences :: Doc -> [Position]
-findOccurrences doc =
-  Position total <$> (zip (reverse offsets) doc >>= imperativeVerb)
+findOccurrences :: IndexedDocument -> [Occurrence]
+findOccurrences (IndexedDocument {_total, _sentences}) =
+  groupOccurrences [] (_sentences >>= imperativeVerb)
+  --uncurry (Occurrence _total) <$> (_sentences >>= imperativeVerb)
   where
-    (offsets, total) = foldl' next ([], 0) $ _words <$> doc
-    next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s)
+    groupOccurrences :: [(Int, Int)] -> [Int] -> [Occurrence]
+    groupOccurrences stack [] = uncurry (Occurrence _total) <$> reverse stack
+    groupOccurrences [] (p:ps) = groupOccurrences [(p, 1)] ps
+    groupOccurrences (tmp@(first, tmpLength):stack) (p:ps)
+      | p == first + tmpLength =
+          groupOccurrences ((first, tmpLength + 1):stack) ps
+      | otherwise = groupOccurrences ((p, 1):tmp:stack) ps

-getSID :: ID -> Maybe Int
-getSID (SID n) = Just n
-getSID _ = Nothing
-
-imperativeVerb :: (Int, Sent) -> [Int]
-imperativeVerb (offset, sentence) = sort (syntax sentence >>= findSpans False)
+imperativeVerb :: IndexedSentence -> [Int]
+imperativeVerb (IndexedSentence {_offset, _syntax}) =
+  sort (_syntax >>= findSpans False)
   where
     findSpans isImperative (Node {rootLabel, subForest}) =
-      case getMoods $ _feats rootLabel of
-        Just moods -> let nowImperative = moods == ["Imp"] in
+      case Map.lookup "Mood" $ _feats rootLabel of
+        Just moods -> let nowImperative = _values moods == ["Imp"] in
           consIf rootLabel nowImperative (findSpans nowImperative =<< subForest)
         Nothing ->
          consIf rootLabel isImperative (findSpans isImperative =<< subForest)
-    consIf a b = if b then (maybe id ((:) . (offset+)) $ getSID (_id a)) else id
-    getMoods = fmap _featValues . find (("Mood" ==) . _feat)
-
-type Syntax = Forest (CW AW)
-
-syntax :: Sent -> Syntax
-syntax = build 0 . _words
-  where
-    build n cws =
-      let (pointing, others) = partition (pointingTo n) cws in
-      recurseOn others <$> pointing
-    recurseOn cws cw = Node
-      { rootLabel = cw, subForest = maybe [] (`build` cws) $ getSID (_id cw) }
-    pointingTo n cw = maybe False (n ==) (getSID . _head =<< _rel cw)
+    consIf a b = if b then (positions _offset a ++) else id

 main :: IO ()
 main = getConfig >>= runReaderT chain
   where
     chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay
     searchAndDisplay rows = do
-      liftIO . ByteString.putStr $ encodeWith toTsv [getHeader (for :: Measure)]
+      liftIO $ tsvLines [getHeader (for :: Measure)]
       mapM_ (\ar -> profile ar >>= liftIO . tsvLines) rows
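Editor's note: the other improvement from the commit message shows in profile above, which now decodes a pre-serialized IndexedDocument instead of re-parsing CoNLL-U at query time. A minimal sketch of that loading step, assuming the Serialize instances defined in Conllu.Tree (whose import of Data.Serialize appears in the first hunk):

import Conllu.Tree (IndexedDocument(..))
import qualified Data.ByteString as ByteString (readFile)
import Data.Serialize (decode)

-- Decode one serialized document and report its token count,
-- mirroring the readFile/decode pipeline used by profile above.
loadDocument :: FilePath -> IO ()
loadDocument path = do
  bytes <- ByteString.readFile path
  case decode bytes :: Either String IndexedDocument of
    Left err -> putStrLn ("skipping " ++ path ++ ": " ++ err)
    Right doc -> print (_total doc)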
@@ -2,48 +2,15 @@
 from GEODE import fromTSV, toKey, toTSV
 from GEODE.store import prepare
-from GEODE.signal import gate, resample
+from GEODE.signal import resample
 import matplotlib.pyplot as plot
 import seaborn
 from sys import argv

-def plotSingle(measure):
-    return gate(measure['position'], measure['totalSize'], offset=1)
-
-histData = [0.3741792369121561, 0.7607808340727595, 0.8889086069210292,
-            0.7491570541259982, 1.2057675244010648, 1.0869565217391306,
-            1.096095829636202, 0.7124223602484473, 0.7251109139307897,
-            0.9335403726708075, 1.2277728482697428, 0.9582963620230699,
-            0.8540372670807452, 1.0281277728482696, 1.4221827861579415,
-            0.9218278615794143, 0.46814551907719604, 0.7717834960070986,
-            1.083762200532387, 0.65226264418811, 0.5771073646850046,
-            0.8025732031943212, 0.5266193433895296, 0.8911268855368234,
-            0.6836734693877551, 0.9039041703637977, 0.8720496894409939,
-            0.7113575865128662, 0.8984028393966283, 0.8993788819875776,
-            1.0016858917480034, 0.5857142857142857, 0.7364685004436559,
-            0.8047914818101152, 0.7055900621118011, 0.9018633540372669,
-            0.944010647737356, 0.9955634427684119, 1.0425909494232473,
-            0.9046140195208519, 0.8504880212954751, 1.1251109139307898,
-            0.44631765749778174, 0.49893522626441883, 0.6860692102928126,
-            0.7024844720496894, 0.4693877551020407, 1.5570541259982251,
-            0.8903283052351374, 0.6923691215616682, 0.8062999112688553,
-            1.0178349600709848, 0.5559006211180125, 0.7621118012422359,
-            0.848447204968944, 0.5782608695652174, 0.8464063886424137,
-            0.5537710736468501, 0.7160603371783496, 0.7982253771073646,
-            0.8371783496007098, 0.9143744454303461, 1.0799467613132205,
-            0.9581188997338067, 0.8597160603371785, 0.864685004436557,
-            1.2598935226264418, 1.3385093167701863, 0.45891748003549254,
-            0.9355811889973382, 0.6289263531499556, 0.7637089618456078,
-            0.7324755989352264, 0.754924578527063, 0.568589174800355,
-            0.49778172138420584, 0.7707187222715175, 1.0097604259094939,
-            0.8621118012422362, 0.8971606033717835, 1.1584738243123336,
-            1.1568766637089618, 0.7698314108251997, 0.9032830523513753,
-            0.5743566992014197, 0.8896184560780832, 0.7858917480035492,
-            0.9899733806566103, 1.617657497781721, 1.066725820763088,
-            0.6067435669920143, 1.1874001774622889, 1.0669920141969833,
-            1.1996450754214731, 1.4835847382431233, 1.6580301685891752,
-            2.2103815439219168, 2.4215616681455185, 2.7979591836734694,
-            5.278970718722271]
+def gate(measure):
+    first, last = measure['position'], measure['position'] + measure['size']
+    return [1 if i >= first and i < last else 0
+            for i in range(1, 1 + measure['totalSize'])]

 def plotProfile(profile, outputPath):
     plot.figure(figsize=(16,13))
@@ -59,17 +26,26 @@ def plotProfile(profile, outputPath):
     ax.fill_between(x, y, alpha=0.3)
     plot.savefig(prepare(outputPath), dpi=300, bbox_inches='tight')

+def sumProfiles(sameSizeProfiles):
+    return list(map(sum, zip(*sameSizeProfiles)))
+
 def computeProfile(measures, resolution=100):
-    profile = [0]*resolution
-    for resampled in map(resample(resolution), map(plotSingle, measures)):
-        profile = list(map(sum, zip(profile, resampled)))
-    return [100*x/len(measures) for x in profile]
+    bySize, count = {}, 0
+    for measure in measures:
+        distribution = gate(measure)
+        count += measure['size']
+        l = len(distribution)
+        if l not in bySize:
+            bySize[l] = []
+        bySize[l].append(distribution)
+    resampled = map(resample(resolution), map(sumProfiles, bySize.values()))
+    return [100*x/count for x in sumProfiles(list(resampled))]

-def drawProfile(measures, outputPath):
+def visualiseProfile(measures, outputPath):
     profile = computeProfile(measures)
     toTSV(prepare(f"{outputPath}/profile.tsv"), profile, sortBy=None)
     plotProfile(profile, f"{outputPath}/profile.png")

 if __name__ == '__main__':
-    drawProfile([measure for _, measure in fromTSV(argv[1]).iterrows()], argv[2])
-    #plotProfile(histData, argv[2])
+    visualiseProfile([measure for _, measure in fromTSV(argv[1]).iterrows()],
+                     argv[2])