
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: abrenon/outillage
Commits on Source (3)
 {-# LANGUAGE DataKinds, DeriveGeneric, ExplicitNamespaces, TypeFamilies #-}
 module Conllu.Tree
-  ( Syntax
-  , syntax ) where
+  ( Feat(..)
+  , ID(..)
+  , IndexedDocument(..)
+  , IndexedSentence(..)
+  , IndexedWord(..)
+  , indexDocument
+  , indexSentence
+  , indexWord
+  , positions ) where

-import qualified Conllu.Type as Conllu (AW, CW(..), ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
+import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
 import qualified Conllu.UposTagset as Conllu (POS)
 import qualified Conllu.DeprelTagset as Conllu (EP)
+import Data.Int (Int8)
 import Data.List (partition)
 import Data.Map as Map (Map, empty, insert)
 import Data.Serialize (Serialize(..))
@@ -20,13 +28,16 @@ data ID =

 instance Serialize ID

+enumCast :: (Enum a, Enum b) => a -> b
+enumCast = toEnum . fromEnum
+
 newtype POS = POS Conllu.POS deriving Show
 instance Generic POS where
-  type Rep POS = Rec0 Int
-  from (POS c) = K1 (fromEnum c)
-  to (K1 i) = POS (toEnum i)
+  type Rep POS = Rec0 Int8
+  from (POS c) = K1 (enumCast c)
+  to (K1 i) = POS (enumCast i)
 instance Serialize POS
@@ -38,11 +49,11 @@ instance Serialize Feat

 newtype EP = EP Conllu.EP deriving Show
 instance Generic EP where
-  type Rep EP = Rec0 Int
-  from (EP c) = K1 (fromEnum c)
-  to (K1 i) = EP (toEnum i)
+  type Rep EP = Rec0 Int8
+  from (EP c) = K1 (enumCast c)
+  to (K1 i) = EP (enumCast i)
 instance Serialize EP
@@ -69,17 +80,34 @@ data IndexedWord = IndexedWord

 instance Serialize IndexedWord

-type Syntax = Forest IndexedWord
-
-getSID :: Conllu.ID -> Maybe Int
-getSID (Conllu.SID n) = Just n
-getSID _ = Nothing
+data IndexedSentence = IndexedSentence
+  { _offset :: Int
+  , _syntax :: Forest IndexedWord } deriving (Show, Generic)
+
+instance Serialize IndexedSentence
+
+data IndexedDocument = IndexedDocument
+  { _total :: Int
+  , _sentences :: [IndexedSentence] } deriving (Show, Generic)
+
+instance Serialize IndexedDocument

 idOfConllu :: Conllu.ID -> ID
 idOfConllu (Conllu.SID i) = SID i
 idOfConllu (Conllu.MID i j) = MID i j
 idOfConllu (Conllu.EID i j) = EID i j

+getSID :: Conllu.ID -> Maybe Int
+getSID (Conllu.SID n) = Just n
+getSID _ = Nothing
+
+positions :: Int -> IndexedWord -> [Int]
+positions offset = fmap (offset +) . getIDs . _id
+  where
+    getIDs (SID i) = [i]
+    getIDs (MID i j) = [i..j]
+    getIDs _ = []
+
 featsOfConllu :: [Conllu.Feat] -> FEATS
 featsOfConllu = foldr indexFeat Map.empty
   where
@@ -107,13 +135,22 @@ indexWord cw = IndexedWord
   , _deps = relOfConllu <$> Conllu._deps cw
   , _misc = Conllu._misc cw }

-syntax :: Conllu.Sent -> Syntax
-syntax = build 0 . Conllu._words
+indexSentence :: Int -> [Conllu.CW Conllu.AW] -> IndexedSentence
+indexSentence offset = IndexedSentence offset . build 0
   where
     build n cws =
       let (pointing, others) = partition (pointingTo n) cws in
       recurseOn others <$> pointing
     recurseOn cws cw = Node
-      { rootLabel = indexWord cw, subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) }
+      { rootLabel = indexWord cw
+      , subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) }
     pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw)
+
+indexDocument :: Conllu.Doc -> IndexedDocument
+indexDocument doc = addSentences (0, []) $ Conllu._words <$> doc
+  where
+    addSentences (_total, sentencesStack) [] =
+      IndexedDocument {_total, _sentences = reverse sentencesStack}
+    addSentences (partial, sentencesStack) (sentence:others) =
+      let indexed = indexSentence partial sentence in
+      addSentences (partial + length sentence, indexed:sentencesStack) others
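
To illustrate the indexing introduced above: indexDocument threads a running token count through the document, so each IndexedSentence records the offset of its first word and _total ends up as the document length. A minimal standalone sketch of that accumulation, with hypothetical names not taken from the project:

-- Illustrative sketch only: mirrors the fold in addSentences, applied to
-- plain sentence lengths instead of parsed sentences.
offsetsAndTotal :: [Int] -> ([Int], Int)
offsetsAndTotal = go 0
  where
    go partial [] = ([], partial)
    go partial (n:ns) =
      let (offsets, total) = go (partial + n) ns
      in (partial : offsets, total)

-- ghci> offsetsAndTotal [5, 3, 7]
-- ([0,5,8],15)   -- per-sentence offsets, and _total = 15
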
@@ -3,9 +3,6 @@ import math

 def curry(f):
     return lambda x: (lambda *args: f(x, *args))

-def gate(n, size, offset=0):
-    return [1 if i == n else 0 for i in range(offset, offset+size)]
-
 @curry
 def orientedIntersection(l, sNew, sOld):
     left = max(sNew*l[0], sOld*l[1])
...
@@ -6,6 +6,7 @@
     ((gnu packages haskell) #:select (ghc))
     ((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt))
     ((gnu packages haskell-xyz) #:select (ghc-cassava
+                                          ghc-cereal
                                          ghc-hs-conllu
                                          ghc-random
                                          ghc-regex-tdfa))

@@ -30,6 +31,7 @@
     ghc ; running haskell
     ghc-aeson ; working with JSON in haskell
     ghc-cassava ; working with CSV in haskell
+    ghc-cereal ; serializing haskell objects to/from disk
     ghc-geode ; handling corpus files
     ghc-hs-conllu ; working on syntax-annotated documents
     ghc-hxt ; working on xml documents
...
 #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
 {-# LANGUAGE DeriveGeneric, ExplicitNamespaces #-}

-import Conllu.Parse (parseConllu)
-import Conllu.Type (AW, CW(..), Doc, Feat(..), ID(..), Rel(..), Sent(..))
+import Conllu.Tree
+  ( Feat(..), IndexedDocument(..), IndexedSentence(..), IndexedWord(..)
+  , positions )
 import Control.Applicative ((<**>))
 import Control.Monad.IO.Class (liftIO)
 import Control.Monad.Reader (ReaderT, asks, runReaderT)
-import Data.ByteString.Lazy as ByteString (putStr)
-import Data.Csv (ToNamedRecord(..), encodeWith)
-import Data.List (find, foldl', partition, sort)
-import Data.Tree (Forest, Tree(..))
+import Data.Csv (ToNamedRecord(..))
+import Data.ByteString as ByteString (readFile)
+import Data.List (sort)
+import Data.Map as Map (lookup)
+import Data.Serialize (decode)
+import Data.Tree (Tree(..))
 import GEODE.Metadata
   ( type(@), ArticleRecord, DefaultFields(..), HasDefaultHeader(..), Record(..)
-  , WithDefaultHeader(..), glue, readNamedTsv, toTsv, tsvLines )
+  , WithDefaultHeader(..), glue, readNamedTsv, tsvLines )
 import GEODE.Metadata.TSV.Header (for, getHeader)
 import GHC.Generics (Generic)
 import Options.Applicative
@@ -30,7 +33,7 @@ configParser = Config
   <$> strOption root
   <*> strOption inputTsvArg
   where
-    root = short 'r' <> long "root" <> metavar "CONLLU_DIRECTORY"
+    root = short 'r' <> long "root" <> metavar "TREE_DIRECTORY"
       <> help "path to the base directory containing the syntax annotations"
     inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV"
       <> help "TSV file containing the article records to process"
@@ -42,63 +45,54 @@ getConfig = execParser
     (fullDesc
     <> progDesc "A textometric tool to draw discursive profiles"))

-data Position = Position
+data Occurrence = Occurrence
   { totalSize :: Int
-  , position :: Int } deriving Generic
+  , position :: Int
+  , size :: Int } deriving Generic

-instance ToNamedRecord Position
-instance HasDefaultHeader Position where
-  defaultFields = DefaultFields ["position", "totalSize"]
+instance ToNamedRecord Occurrence
+instance HasDefaultHeader Occurrence where
+  defaultFields = DefaultFields ["position", "size", "totalSize"]

-type Measure = ArticleRecord @ Position
+type Measure = ArticleRecord @ Occurrence

 profile :: ArticleRecord -> ReaderT Config IO [WithDefaultHeader Measure]
 profile articleRecord = do
-  path <- asks ((</> relativePath articleRecord "conllu") . inputRoot)
-  liftIO $ readFile path >>= either skipAndWarn analyse . parseConllu path
+  path <- asks ((</> relativePath articleRecord "tree") . inputRoot)
+  liftIO $ ByteString.readFile path >>= either skipAndWarn analyse . decode
   where
     skipAndWarn msg = [] <$ warn msg
     analyse = pure . fmap (glue articleRecord) . findOccurrences

-findOccurrences :: Doc -> [Position]
-findOccurrences doc =
-  Position total <$> (zip (reverse offsets) doc >>= imperativeVerb)
+findOccurrences :: IndexedDocument -> [Occurrence]
+findOccurrences (IndexedDocument {_total, _sentences}) =
+  groupOccurrences [] (_sentences >>= imperativeVerb)
+  --uncurry (Occurrence _total) <$> (_sentences >>= imperativeVerb)
   where
-    (offsets, total) = foldl' next ([], 0) $ _words <$> doc
-    next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s)
+    groupOccurrences :: [(Int, Int)] -> [Int] -> [Occurrence]
+    groupOccurrences stack [] = uncurry (Occurrence _total) <$> reverse stack
+    groupOccurrences [] (p:ps) = groupOccurrences [(p, 1)] ps
+    groupOccurrences (tmp@(first, tmpLength):stack) (p:ps)
+      | p == first + tmpLength =
+          groupOccurrences ((first, tmpLength + 1):stack) ps
+      | otherwise = groupOccurrences ((p, 1):tmp:stack) ps

-getSID :: ID -> Maybe Int
-getSID (SID n) = Just n
-getSID _ = Nothing
-
-imperativeVerb :: (Int, Sent) -> [Int]
-imperativeVerb (offset, sentence) = sort (syntax sentence >>= findSpans False)
+imperativeVerb :: IndexedSentence -> [Int]
+imperativeVerb (IndexedSentence {_offset, _syntax}) =
+  sort (_syntax >>= findSpans False)
   where
     findSpans isImperative (Node {rootLabel, subForest}) =
-      case getMoods $ _feats rootLabel of
-        Just moods -> let nowImperative = moods == ["Imp"] in
+      case Map.lookup "Mood" $ _feats rootLabel of
+        Just moods -> let nowImperative = _values moods == ["Imp"] in
           consIf rootLabel nowImperative (findSpans nowImperative =<< subForest)
         Nothing ->
          consIf rootLabel isImperative (findSpans isImperative =<< subForest)
-    consIf a b = if b then (maybe id ((:) . (offset+)) $ getSID (_id a)) else id
-    getMoods = fmap _featValues . find (("Mood" ==) . _feat)
-
-type Syntax = Forest (CW AW)
-
-syntax :: Sent -> Syntax
-syntax = build 0 . _words
-  where
-    build n cws =
-      let (pointing, others) = partition (pointingTo n) cws in
-      recurseOn others <$> pointing
-    recurseOn cws cw = Node
-      { rootLabel = cw, subForest = maybe [] (`build` cws) $ getSID (_id cw) }
-    pointingTo n cw = maybe False (n ==) (getSID . _head =<< _rel cw)
+    consIf a b = if b then (positions _offset a ++) else id

 main :: IO ()
 main = getConfig >>= runReaderT chain
   where
     chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay
     searchAndDisplay rows = do
-      liftIO . ByteString.putStr $ encodeWith toTsv [getHeader (for :: Measure)]
+      liftIO $ tsvLines [getHeader (for :: Measure)]
       mapM_ (\ar -> profile ar >>= liftIO . tsvLines) rows
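
To see what the new groupOccurrences accumulator does with the sorted positions returned by imperativeVerb: consecutive token positions are merged into (position, size) runs, which is exactly what the extra size field of Occurrence carries. A small standalone sketch of that grouping, with a hypothetical name not taken from the project:

-- Illustrative sketch only: merge runs of consecutive positions into
-- (start, length) pairs, the same grouping groupOccurrences builds with its stack.
groupRuns :: [Int] -> [(Int, Int)]
groupRuns = foldr step []
  where
    step p ((start, len) : rest)
      | p + 1 == start = (p, len + 1) : rest
    step p runs = (p, 1) : runs

-- ghci> groupRuns [3,4,5,9,12,13]
-- [(3,3),(9,1),(12,2)]
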
@@ -2,7 +2,7 @@
 {-# LANGUAGE DeriveGeneric, ExplicitNamespaces, TypeFamilies #-}

 import Conllu.Parse (parseConllu)
-import Conllu.Tree (syntax)
+import Conllu.Tree (indexDocument)
 import Control.Applicative ((<**>))
 import Control.Monad.IO.Class (MonadIO(..))
 import Control.Monad.Reader (MonadReader, asks, runReaderT)

@@ -50,7 +50,7 @@ toTree articleRecord = do
   path <- asks ((</> relativePath articleRecord "tree") . outputRoot)
   liftIO $ do
     createDirectoryIfMissing True (takeDirectory path)
-    ByteString.writeFile path . encode $ syntax =<< doc
+    ByteString.writeFile path . encode $ indexDocument doc

 main :: IO ()
 main = getConfig >>= runReaderT chain
...
@@ -2,48 +2,15 @@
 from GEODE import fromTSV, toKey, toTSV
 from GEODE.store import prepare
-from GEODE.signal import gate, resample
+from GEODE.signal import resample
 import matplotlib.pyplot as plot
 import seaborn
 from sys import argv

-def plotSingle(measure):
-    return gate(measure['position'], measure['totalSize'], offset=1)
+def gate(measure):
+    first, last = measure['position'], measure['position'] + measure['size']
+    return [1 if i >= first and i < last else 0
+            for i in range(1, 1 + measure['totalSize'])]

-histData = [0.3741792369121561, 0.7607808340727595, 0.8889086069210292,
-            0.7491570541259982, 1.2057675244010648, 1.0869565217391306,
-            1.096095829636202, 0.7124223602484473, 0.7251109139307897,
-            0.9335403726708075, 1.2277728482697428, 0.9582963620230699,
-            0.8540372670807452, 1.0281277728482696, 1.4221827861579415,
-            0.9218278615794143, 0.46814551907719604, 0.7717834960070986,
-            1.083762200532387, 0.65226264418811, 0.5771073646850046,
-            0.8025732031943212, 0.5266193433895296, 0.8911268855368234,
-            0.6836734693877551, 0.9039041703637977, 0.8720496894409939,
-            0.7113575865128662, 0.8984028393966283, 0.8993788819875776,
-            1.0016858917480034, 0.5857142857142857, 0.7364685004436559,
-            0.8047914818101152, 0.7055900621118011, 0.9018633540372669,
-            0.944010647737356, 0.9955634427684119, 1.0425909494232473,
-            0.9046140195208519, 0.8504880212954751, 1.1251109139307898,
-            0.44631765749778174, 0.49893522626441883, 0.6860692102928126,
-            0.7024844720496894, 0.4693877551020407, 1.5570541259982251,
-            0.8903283052351374, 0.6923691215616682, 0.8062999112688553,
-            1.0178349600709848, 0.5559006211180125, 0.7621118012422359,
-            0.848447204968944, 0.5782608695652174, 0.8464063886424137,
-            0.5537710736468501, 0.7160603371783496, 0.7982253771073646,
-            0.8371783496007098, 0.9143744454303461, 1.0799467613132205,
-            0.9581188997338067, 0.8597160603371785, 0.864685004436557,
-            1.2598935226264418, 1.3385093167701863, 0.45891748003549254,
-            0.9355811889973382, 0.6289263531499556, 0.7637089618456078,
-            0.7324755989352264, 0.754924578527063, 0.568589174800355,
-            0.49778172138420584, 0.7707187222715175, 1.0097604259094939,
-            0.8621118012422362, 0.8971606033717835, 1.1584738243123336,
-            1.1568766637089618, 0.7698314108251997, 0.9032830523513753,
-            0.5743566992014197, 0.8896184560780832, 0.7858917480035492,
-            0.9899733806566103, 1.617657497781721, 1.066725820763088,
-            0.6067435669920143, 1.1874001774622889, 1.0669920141969833,
-            1.1996450754214731, 1.4835847382431233, 1.6580301685891752,
-            2.2103815439219168, 2.4215616681455185, 2.7979591836734694,
-            5.278970718722271]

 def plotProfile(profile, outputPath):
     plot.figure(figsize=(16,13))
@@ -59,17 +26,26 @@ def plotProfile(profile, outputPath):
     ax.fill_between(x, y, alpha=0.3)
     plot.savefig(prepare(outputPath), dpi=300, bbox_inches='tight')

+def sumProfiles(sameSizeProfiles):
+    return list(map(sum, zip(*sameSizeProfiles)))
+
 def computeProfile(measures, resolution=100):
-    profile = [0]*resolution
-    for resampled in map(resample(resolution), map(plotSingle, measures)):
-        profile = list(map(sum, zip(profile, resampled)))
-    return [100*x/len(measures) for x in profile]
+    bySize, count = {}, 0
+    for measure in measures:
+        distribution = gate(measure)
+        count += measure['size']
+        l = len(distribution)
+        if l not in bySize:
+            bySize[l] = []
+        bySize[l].append(distribution)
+    resampled = map(resample(resolution), map(sumProfiles, bySize.values()))
+    return [100*x/count for x in sumProfiles(list(resampled))]

-def drawProfile(measures, outputPath):
+def visualiseProfile(measures, outputPath):
     profile = computeProfile(measures)
     toTSV(prepare(f"{outputPath}/profile.tsv"), profile, sortBy=None)
     plotProfile(profile, f"{outputPath}/profile.png")

 if __name__ == '__main__':
-    drawProfile([measure for _, measure in fromTSV(argv[1]).iterrows()], argv[2])
-    #plotProfile(histData, argv[2])
+    visualiseProfile([measure for _, measure in fromTSV(argv[1]).iterrows()],
+                     argv[2])