
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: abrenon/outillage
Commits on Source (3)
{-# LANGUAGE DataKinds, DeriveGeneric, ExplicitNamespaces, TypeFamilies #-}
module Conllu.Tree
-  ( Syntax
-  , syntax ) where
-import qualified Conllu.Type as Conllu (AW, CW(..), ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
+  ( Feat(..)
+  , ID(..)
+  , IndexedDocument(..)
+  , IndexedSentence(..)
+  , IndexedWord(..)
+  , indexDocument
+  , indexSentence
+  , indexWord
+  , positions ) where
+import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
import qualified Conllu.UposTagset as Conllu (POS)
import qualified Conllu.DeprelTagset as Conllu (EP)
+import Data.Int (Int8)
import Data.List (partition)
import Data.Map as Map (Map, empty, insert)
import Data.Serialize (Serialize(..))
@@ -20,13 +28,16 @@ data ID =
instance Serialize ID
+enumCast :: (Enum a, Enum b) => a -> b
+enumCast = toEnum . fromEnum
newtype POS = POS Conllu.POS deriving Show
instance Generic POS where
-  type Rep POS = Rec0 Int
+  type Rep POS = Rec0 Int8
-  from (POS c) = K1 (fromEnum c)
+  from (POS c) = K1 (enumCast c)
-  to (K1 i) = POS (toEnum i)
+  to (K1 i) = POS (enumCast i)
instance Serialize POS
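Note: switching the representation from Int to Int8 in these hand-written Generic instances shrinks each serialized tag to one byte, which is safe because the UPOS tagset has far fewer than 128 values. A standalone sketch (not part of the diff) of the enumCast round-trip, using a hypothetical three-constructor tag type in place of Conllu.POS:

import Data.Int (Int8)

-- Convert between any two Enum types through their shared Int index.
enumCast :: (Enum a, Enum b) => a -> b
enumCast = toEnum . fromEnum

-- Hypothetical stand-in for Conllu.POS.
data Tag = Noun | Verb | Adj deriving (Show, Enum)

main :: IO ()
main = do
  let byte = enumCast Verb :: Int8  -- 1
  print byte
  print (enumCast byte :: Tag)      -- Verb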
@@ -38,11 +49,11 @@ instance Serialize Feat
newtype EP = EP Conllu.EP deriving Show
instance Generic EP where
-  type Rep EP = Rec0 Int
+  type Rep EP = Rec0 Int8
-  from (EP c) = K1 (fromEnum c)
+  from (EP c) = K1 (enumCast c)
-  to (K1 i) = EP (toEnum i)
+  to (K1 i) = EP (enumCast i)
instance Serialize EP
@@ -69,17 +80,34 @@ data IndexedWord = IndexedWord
instance Serialize IndexedWord
-type Syntax = Forest IndexedWord
+data IndexedSentence = IndexedSentence
+  { _offset :: Int
+  , _syntax :: Forest IndexedWord } deriving (Show, Generic)
-getSID :: Conllu.ID -> Maybe Int
-getSID (Conllu.SID n) = Just n
-getSID _ = Nothing
+instance Serialize IndexedSentence
+data IndexedDocument = IndexedDocument
+  { _total :: Int
+  , _sentences :: [IndexedSentence] } deriving (Show, Generic)
+instance Serialize IndexedDocument
idOfConllu :: Conllu.ID -> ID
idOfConllu (Conllu.SID i) = SID i
idOfConllu (Conllu.MID i j) = MID i j
idOfConllu (Conllu.EID i j) = EID i j
+getSID :: Conllu.ID -> Maybe Int
+getSID (Conllu.SID n) = Just n
+getSID _ = Nothing
+positions :: Int -> IndexedWord -> [Int]
+positions offset = fmap (offset +) . getIDs . _id
+  where
+    getIDs (SID i) = [i]
+    getIDs (MID i j) = [i..j]
+    getIDs _ = []
featsOfConllu :: [Conllu.Feat] -> FEATS
featsOfConllu = foldr indexFeat Map.empty
  where
@@ -107,13 +135,22 @@ indexWord cw = IndexedWord
  , _deps = relOfConllu <$> Conllu._deps cw
  , _misc = Conllu._misc cw }
-syntax :: Conllu.Sent -> Syntax
-syntax = build 0 . Conllu._words
+indexSentence :: Int -> [Conllu.CW Conllu.AW] -> IndexedSentence
+indexSentence offset = IndexedSentence offset . build 0
  where
    build n cws =
      let (pointing, others) = partition (pointingTo n) cws in
      recurseOn others <$> pointing
    recurseOn cws cw = Node
-      { rootLabel = indexWord cw, subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) }
+      { rootLabel = indexWord cw
+      , subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) }
    pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw)
+indexDocument :: Conllu.Doc -> IndexedDocument
+indexDocument doc = addSentences (0, []) $ Conllu._words <$> doc
+  where
+    addSentences (_total, sentencesStack) [] =
+      IndexedDocument {_total, _sentences = reverse sentencesStack}
+    addSentences (partial, sentencesStack) (sentence:others) =
+      let indexed = indexSentence partial sentence in
+      addSentences (partial + length sentence, indexed:sentencesStack) others
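Note: indexDocument threads a running word count through the sentence list, so each IndexedSentence records how many words precede it; positions (above) then turns a word's sentence-internal ids into document-wide positions by adding that offset. A minimal standalone sketch of the bookkeeping, with hypothetical sentence data:

-- Stamp each sentence with the count of words that precede it.
sentenceOffsets :: [[String]] -> [(Int, [String])]
sentenceOffsets = go 0
  where
    go _ [] = []
    go n (s:ss) = (n, s) : go (n + length s) ss

main :: IO ()
main = print (sentenceOffsets [["Lisez", "ceci"], ["Bien"], ["et", "encore"]])
-- [(0,["Lisez","ceci"]),(2,["Bien"]),(3,["et","encore"])]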
@@ -3,9 +3,6 @@ import math
def curry(f):
    return lambda x: (lambda *args: f(x, *args))
-def gate(n, size, offset=0):
-    return [1 if i == n else 0 for i in range(offset, offset+size)]
@curry
def orientedIntersection(l, sNew, sOld):
    left = max(sNew*l[0], sOld*l[1])
......
@@ -6,6 +6,7 @@
((gnu packages haskell) #:select (ghc))
((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt))
((gnu packages haskell-xyz) #:select (ghc-cassava
+                                      ghc-cereal
                                       ghc-hs-conllu
                                       ghc-random
                                       ghc-regex-tdfa))
@@ -30,6 +31,7 @@
ghc ; running haskell
ghc-aeson ; working with JSON in haskell
ghc-cassava ; working with CSV in haskell
+ghc-cereal ; serializing haskell objects to/from disk
ghc-geode ; handling corpus files
ghc-hs-conllu ; working on syntax-annotated documents
ghc-hxt ; working on xml documents
......
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces #-}
-import Conllu.Parse (parseConllu)
-import Conllu.Type (AW, CW(..), Doc, Feat(..), ID(..), Rel(..), Sent(..))
+import Conllu.Tree
+  ( Feat(..), IndexedDocument(..), IndexedSentence(..), IndexedWord(..)
+  , positions )
import Control.Applicative ((<**>))
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Reader (ReaderT, asks, runReaderT)
-import Data.ByteString.Lazy as ByteString (putStr)
-import Data.Csv (ToNamedRecord(..), encodeWith)
-import Data.List (find, foldl', partition, sort)
-import Data.Tree (Forest, Tree(..))
+import Data.Csv (ToNamedRecord(..))
+import Data.ByteString as ByteString (readFile)
+import Data.List (sort)
+import Data.Map as Map (lookup)
+import Data.Serialize (decode)
+import Data.Tree (Tree(..))
import GEODE.Metadata
  ( type(@), ArticleRecord, DefaultFields(..), HasDefaultHeader(..), Record(..)
-  , WithDefaultHeader(..), glue, readNamedTsv, toTsv, tsvLines )
+  , WithDefaultHeader(..), glue, readNamedTsv, tsvLines )
import GEODE.Metadata.TSV.Header (for, getHeader)
import GHC.Generics (Generic)
import Options.Applicative
@@ -30,7 +33,7 @@ configParser = Config
  <$> strOption root
  <*> strOption inputTsvArg
  where
-    root = short 'r' <> long "root" <> metavar "CONLLU_DIRECTORY"
+    root = short 'r' <> long "root" <> metavar "TREE_DIRECTORY"
      <> help "path to the base directory containing the syntax annotations"
    inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV"
      <> help "TSV file containing the article records to process"
@@ -42,63 +45,54 @@ getConfig = execParser
  (fullDesc
  <> progDesc "A textometric tool to draw discursive profiles"))
-data Occurrence = Occurrence
+data Occurrence = Occurrence
  { totalSize :: Int
-  , position :: Int } deriving Generic
+  , position :: Int
+  , size :: Int } deriving Generic
-instance ToNamedRecord Position
-instance HasDefaultHeader Position where
-  defaultFields = DefaultFields ["position", "totalSize"]
+instance ToNamedRecord Occurrence
+instance HasDefaultHeader Occurrence where
+  defaultFields = DefaultFields ["position", "size", "totalSize"]
-type Measure = ArticleRecord @ Position
+type Measure = ArticleRecord @ Occurrence
profile :: ArticleRecord -> ReaderT Config IO [WithDefaultHeader Measure]
profile articleRecord = do
-  path <- asks ((</> relativePath articleRecord "conllu") . inputRoot)
-  liftIO $ readFile path >>= either skipAndWarn analyse . parseConllu path
+  path <- asks ((</> relativePath articleRecord "tree") . inputRoot)
+  liftIO $ ByteString.readFile path >>= either skipAndWarn analyse . decode
  where
    skipAndWarn msg = [] <$ warn msg
    analyse = pure . fmap (glue articleRecord) . findOccurrences
-findOccurrences :: Doc -> [Position]
-findOccurrences doc =
-  Position total <$> (zip (reverse offsets) doc >>= imperativeVerb)
+findOccurrences :: IndexedDocument -> [Occurrence]
+findOccurrences (IndexedDocument {_total, _sentences}) =
+  groupOccurrences [] (_sentences >>= imperativeVerb)
+  --uncurry (Occurrence _total) <$> (_sentences >>= imperativeVerb)
  where
-    (offsets, total) = foldl' next ([], 0) $ _words <$> doc
-    next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s)
+    groupOccurrences :: [(Int, Int)] -> [Int] -> [Occurrence]
+    groupOccurrences stack [] = uncurry (Occurrence _total) <$> reverse stack
+    groupOccurrences [] (p:ps) = groupOccurrences [(p, 1)] ps
+    groupOccurrences (tmp@(first, tmpLength):stack) (p:ps)
+      | p == first + tmpLength =
+          groupOccurrences ((first, tmpLength + 1):stack) ps
+      | otherwise = groupOccurrences ((p, 1):tmp:stack) ps
-getSID :: ID -> Maybe Int
-getSID (SID n) = Just n
-getSID _ = Nothing
-imperativeVerb :: (Int, Sent) -> [Int]
-imperativeVerb (offset, sentence) = sort (syntax sentence >>= findSpans False)
+imperativeVerb :: IndexedSentence -> [Int]
+imperativeVerb (IndexedSentence {_offset, _syntax}) =
+  sort (_syntax >>= findSpans False)
  where
    findSpans isImperative (Node {rootLabel, subForest}) =
-      case getMoods $ _feats rootLabel of
-        Just moods -> let nowImperative = moods == ["Imp"] in
+      case Map.lookup "Mood" $ _feats rootLabel of
+        Just moods -> let nowImperative = _values moods == ["Imp"] in
          consIf rootLabel nowImperative (findSpans nowImperative =<< subForest)
        Nothing ->
          consIf rootLabel isImperative (findSpans isImperative =<< subForest)
-    consIf a b = if b then (maybe id ((:) . (offset+)) $ getSID (_id a)) else id
-    getMoods = fmap _featValues . find (("Mood" ==) . _feat)
-type Syntax = Forest (CW AW)
-syntax :: Sent -> Syntax
-syntax = build 0 . _words
-  where
-    build n cws =
-      let (pointing, others) = partition (pointingTo n) cws in
-      recurseOn others <$> pointing
-    recurseOn cws cw = Node
-      { rootLabel = cw, subForest = maybe [] (`build` cws) $ getSID (_id cw) }
-    pointingTo n cw = maybe False (n ==) (getSID . _head =<< _rel cw)
+    consIf a b = if b then (positions _offset a ++) else id
main :: IO ()
main = getConfig >>= runReaderT chain
  where
    chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay
    searchAndDisplay rows = do
-      liftIO . ByteString.putStr $ encodeWith toTsv [getHeader (for :: Measure)]
+      liftIO $ tsvLines [getHeader (for :: Measure)]
      mapM_ (\ar -> profile ar >>= liftIO . tsvLines) rows
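Note: groupOccurrences above folds the sorted token positions returned by imperativeVerb into maximal runs of consecutive positions, each becoming one Occurrence with a start and a size. A standalone sketch of the same grouping, reduced to (start, length) pairs:

-- Collapse a sorted list of positions into (start, length) runs.
groupRuns :: [Int] -> [(Int, Int)]
groupRuns = foldr step []
  where
    step p ((q, n):runs) | p + 1 == q = (p, n + 1) : runs
    step p runs = (p, 1) : runs

main :: IO ()
main = print (groupRuns [3, 4, 5, 9, 10, 12])  -- [(3,3),(9,2),(12,1)]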
@@ -2,7 +2,7 @@
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, TypeFamilies #-}
import Conllu.Parse (parseConllu)
-import Conllu.Tree (syntax)
+import Conllu.Tree (indexDocument)
import Control.Applicative ((<**>))
import Control.Monad.IO.Class (MonadIO(..))
import Control.Monad.Reader (MonadReader, asks, runReaderT)
@@ -50,7 +50,7 @@ toTree articleRecord = do
  path <- asks ((</> relativePath articleRecord "tree") . outputRoot)
  liftIO $ do
    createDirectoryIfMissing True (takeDirectory path)
-    ByteString.writeFile path . encode $ syntax =<< doc
+    ByteString.writeFile path . encode $ indexDocument doc
main :: IO ()
main = getConfig >>= runReaderT chain
......
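Note: with ghc-cereal in the manifest, the tree-building script above encodes each IndexedDocument to disk once, and the profiling script decodes it instead of re-parsing CoNLL-U on every run. A minimal round-trip sketch of the Data.Serialize API, using a hypothetical record (the real instances are the generically derived ones in Conllu.Tree):

{-# LANGUAGE DeriveGeneric #-}
import Data.ByteString as ByteString (readFile, writeFile)
import Data.Serialize (Serialize, decode, encode)
import GHC.Generics (Generic)

-- Hypothetical stand-in for IndexedDocument.
data Cached = Cached { total :: Int, forms :: [String] }
  deriving (Show, Generic)

instance Serialize Cached -- generic default, as in Conllu.Tree

main :: IO ()
main = do
  ByteString.writeFile "/tmp/cached.bin" (encode (Cached 2 ["lisez", "ceci"]))
  decoded <- decode <$> ByteString.readFile "/tmp/cached.bin"
  print (decoded :: Either String Cached)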
@@ -2,48 +2,15 @@
from GEODE import fromTSV, toKey, toTSV
from GEODE.store import prepare
-from GEODE.signal import gate, resample
+from GEODE.signal import resample
import matplotlib.pyplot as plot
import seaborn
from sys import argv
-def plotSingle(measure):
-    return gate(measure['position'], measure['totalSize'], offset=1)
-histData = [0.3741792369121561, 0.7607808340727595, 0.8889086069210292,
-            0.7491570541259982, 1.2057675244010648, 1.0869565217391306,
-            1.096095829636202, 0.7124223602484473, 0.7251109139307897,
-            0.9335403726708075, 1.2277728482697428, 0.9582963620230699,
-            0.8540372670807452, 1.0281277728482696, 1.4221827861579415,
-            0.9218278615794143, 0.46814551907719604, 0.7717834960070986,
-            1.083762200532387, 0.65226264418811, 0.5771073646850046,
-            0.8025732031943212, 0.5266193433895296, 0.8911268855368234,
-            0.6836734693877551, 0.9039041703637977, 0.8720496894409939,
-            0.7113575865128662, 0.8984028393966283, 0.8993788819875776,
-            1.0016858917480034, 0.5857142857142857, 0.7364685004436559,
-            0.8047914818101152, 0.7055900621118011, 0.9018633540372669,
-            0.944010647737356, 0.9955634427684119, 1.0425909494232473,
-            0.9046140195208519, 0.8504880212954751, 1.1251109139307898,
-            0.44631765749778174, 0.49893522626441883, 0.6860692102928126,
-            0.7024844720496894, 0.4693877551020407, 1.5570541259982251,
-            0.8903283052351374, 0.6923691215616682, 0.8062999112688553,
-            1.0178349600709848, 0.5559006211180125, 0.7621118012422359,
-            0.848447204968944, 0.5782608695652174, 0.8464063886424137,
-            0.5537710736468501, 0.7160603371783496, 0.7982253771073646,
-            0.8371783496007098, 0.9143744454303461, 1.0799467613132205,
-            0.9581188997338067, 0.8597160603371785, 0.864685004436557,
-            1.2598935226264418, 1.3385093167701863, 0.45891748003549254,
-            0.9355811889973382, 0.6289263531499556, 0.7637089618456078,
-            0.7324755989352264, 0.754924578527063, 0.568589174800355,
-            0.49778172138420584, 0.7707187222715175, 1.0097604259094939,
-            0.8621118012422362, 0.8971606033717835, 1.1584738243123336,
-            1.1568766637089618, 0.7698314108251997, 0.9032830523513753,
-            0.5743566992014197, 0.8896184560780832, 0.7858917480035492,
-            0.9899733806566103, 1.617657497781721, 1.066725820763088,
-            0.6067435669920143, 1.1874001774622889, 1.0669920141969833,
-            1.1996450754214731, 1.4835847382431233, 1.6580301685891752,
-            2.2103815439219168, 2.4215616681455185, 2.7979591836734694,
-            5.278970718722271]
+def gate(measure):
+    first, last = measure['position'], measure['position'] + measure['size']
+    return [1 if i >= first and i < last else 0
+            for i in range(1, 1 + measure['totalSize'])]
def plotProfile(profile, outputPath):
    plot.figure(figsize=(16,13))
@@ -59,17 +26,26 @@ def plotProfile(profile, outputPath):
    ax.fill_between(x, y, alpha=0.3)
    plot.savefig(prepare(outputPath), dpi=300, bbox_inches='tight')
+def sumProfiles(sameSizeProfiles):
+    return list(map(sum, zip(*sameSizeProfiles)))
def computeProfile(measures, resolution=100):
-    profile = [0]*resolution
-    for resampled in map(resample(resolution), map(plotSingle, measures)):
-        profile = list(map(sum, zip(profile, resampled)))
-    return [100*x/len(measures) for x in profile]
+    bySize, count = {}, 0
+    for measure in measures:
+        distribution = gate(measure)
+        count += measure['size']
+        l = len(distribution)
+        if l not in bySize:
+            bySize[l] = []
+        bySize[l].append(distribution)
+    resampled = map(resample(resolution), map(sumProfiles, bySize.values()))
+    return [100*x/count for x in sumProfiles(list(resampled))]
-def drawProfile(measures, outputPath):
+def visualiseProfile(measures, outputPath):
    profile = computeProfile(measures)
    toTSV(prepare(f"{outputPath}/profile.tsv"), profile, sortBy=None)
    plotProfile(profile, f"{outputPath}/profile.png")
if __name__ == '__main__':
-    drawProfile([measure for _, measure in fromTSV(argv[1]).iterrows()], argv[2])
-    #plotProfile(histData, argv[2])
+    visualiseProfile([measure for _, measure in fromTSV(argv[1]).iterrows()],
+                     argv[2])