Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • abrenon/outillage
1 result
Show changes
Commits on Source (3)
{-# LANGUAGE DataKinds, DeriveGeneric, ExplicitNamespaces, TypeFamilies #-}
{-# LANGUAGE DeriveGeneric, OverloadedStrings, TypeFamilies #-}
module Conllu.Tree
( Feat(..)
( EP(..)
, Feat(..)
, ID(..)
, IndexedDocument(..)
, IndexedSentence(..)
, IndexedWord(..)
, POS(..)
, Rel(..)
, indexDocument
, indexSentence
, indexWord
......@@ -13,21 +16,32 @@ module Conllu.Tree
import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
import qualified Conllu.UposTagset as Conllu (POS)
import qualified Conllu.DeprelTagset as Conllu (EP)
import Data.ByteString.Char8 as ByteString (pack)
import Data.Csv ((.=), ToField(..), ToNamedRecord(..), namedRecord)
import Data.Int (Int8)
import Data.List (partition)
import Data.Map as Map (Map, empty, insert)
import Data.List (intercalate, partition)
import Data.Map as Map (Map, empty, insert, toList)
import Data.Serialize (Serialize(..))
import Data.Tree (Forest, Tree(..))
import GEODE.Metadata (DefaultFields(..), HasDefaultHeader(..))
import GHC.Generics (Generic(..), K1(..), Rec0)
data ID =
SID Conllu.Index
| MID Conllu.Index Conllu.Index
| EID Conllu.Index Conllu.Index
deriving (Show, Generic)
deriving Generic
instance Show ID where
show (SID i) = show i
show (MID i j) = show i ++ '-': show j
show (EID i j) = show i ++ '.': show j
instance Serialize ID
instance ToField ID where
toField = ByteString.pack . show
enumCast :: (Enum a, Enum b) => a -> b
enumCast = toEnum . fromEnum
......@@ -41,6 +55,9 @@ instance Generic POS where
instance Serialize POS
instance ToField POS where
toField (POS p) = ByteString.pack $ show p
data Feat = Feat
{ _values :: [String]
, _type :: Maybe String } deriving (Show, Generic)
......@@ -67,6 +84,13 @@ instance Serialize Rel
type FEATS = Map String Feat
instance ToField FEATS where
toField = ByteString.pack . intercalate "|" . fmap showFeat . toList
where
showFeat (k, Feat {_values, _type}) =
k ++ '=':(intercalate "," _values ++ maybe "" showType _type)
showType t = '[': t ++ "]"
data IndexedWord = IndexedWord
{ _id :: ID
, _form :: Conllu.FORM
......@@ -78,6 +102,29 @@ data IndexedWord = IndexedWord
, _deps :: [Rel]
, _misc :: Conllu.MISC } deriving (Show, Generic)
instance ToNamedRecord IndexedWord where
toNamedRecord indexedWord = namedRecord
[ "id" .= _id indexedWord
, "form" .= _form indexedWord
, "lemma" .= _lemma indexedWord
, "upos" .= _upos indexedWord
, "xpos" .= _xpos indexedWord
, "feats" .= _feats indexedWord
, "head" .= (_head <$> _rel indexedWord)
, "deprel" .= ((sep ":" . deprel) <$> _rel indexedWord)
, "deps" .= ("|" `sep` (showDep <$> _deps indexedWord))
, "misc" .= _misc indexedWord ]
where
sep = intercalate
deprel (Rel {_deprel = EP ep, _subdep, _rest}) =
show ep : maybe [] id ((:) <$> _subdep <*> _rest)
showDep rel@(Rel {_head}) = ":" `sep` (show _head:deprel rel)
instance HasDefaultHeader IndexedWord where
defaultFields = DefaultFields
[ "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps"
, "misc" ]
instance Serialize IndexedWord
data IndexedSentence = IndexedSentence
......
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
{-# LANGUAGE ExplicitNamespaces #-}
import qualified Conllu.UposTagset as Conllu (POS(NOUN))
import qualified Conllu.DeprelTagset as Conllu (EP(APPOS))
import Conllu.Tree
( EP(..), IndexedDocument(..), IndexedSentence(..), IndexedWord(..), POS(..)
, Rel(..) )
import Control.Applicative ((<**>), (<|>))
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Reader (ReaderT, asks, runReaderT)
import Data.ByteString as ByteString (readFile)
import Data.Maybe (listToMaybe)
import Data.Serialize (decode)
import Data.Tree (Tree(..))
import GEODE.Metadata
( type(@), ArticleRecord, Record(..), WithDefaultHeader(..), glue
, readNamedTsv, tsvLines )
import GEODE.Metadata.TSV.Header (for, getHeader)
import Options.Applicative
( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc
, short, strOption )
import System.FilePath ((</>))
import System.Script (try, warn)
data Config = Config
{ inputRoot :: FilePath
, inputTsv :: FilePath }
configParser :: Parser Config
configParser = Config
<$> strOption root
<*> strOption inputTsvArg
where
root = short 'r' <> long "root" <> metavar "TREE_DIRECTORY"
<> help "path to the base directory containing the syntax annotations"
inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV"
<> help "TSV file containing the article records to process"
getConfig :: IO Config
getConfig = execParser
(info
(configParser <**> helper)
(fullDesc
<> progDesc "A textometric tool to extract nouns at the top of the first sentence"))
type Result = ArticleRecord @ IndexedWord
profile :: ArticleRecord -> ReaderT Config IO (Maybe (WithDefaultHeader Result))
profile articleRecord = do
path <- asks ((</> relativePath articleRecord "tree") . inputRoot)
liftIO $ ByteString.readFile path >>= either skipAndWarn analyse . decode
where
skipAndWarn msg = Nothing <$ warn msg
analyse = pure . fmap (glue articleRecord) . searchDocument
searchDocument :: IndexedDocument -> Maybe IndexedWord
searchDocument (IndexedDocument {_sentences}) =
listToMaybe _sentences >>= firstTopNOUN
firstTopNOUN :: IndexedSentence -> Maybe IndexedWord
firstTopNOUN (IndexedSentence {_syntax}) = listToMaybe _syntax >>= fromTop
where
fromTop (Node {rootLabel, subForest})
| isNoun (_upos rootLabel) = Just rootLabel
| otherwise = foldl (<|>) Nothing (apposNoun <$> subForest)
apposNoun (Node {rootLabel})
| isNoun (_upos rootLabel) && isAppos (_deprel <$> _rel rootLabel) =
Just rootLabel
| otherwise = Nothing
isNoun (Just (POS Conllu.NOUN)) = True
isNoun _ = False
isAppos (Just (EP Conllu.APPOS)) = True
isAppos _ = False
main :: IO ()
main = getConfig >>= runReaderT chain
where
chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay
searchAndDisplay rows = do
liftIO $ tsvLines [getHeader (for :: Result)]
mapM_ (\ar -> profile ar >>= liftIO . tsvLines . maybe [] (:[])) rows
......@@ -12,9 +12,10 @@ def gate(measure):
return [1 if i >= first and i < last else 0
for i in range(1, 1 + measure['totalSize'])]
def plotProfile(profile, outputPath):
def plotDensity(profile, outputPath):
plot.figure(figsize=(16,13))
ax = seaborn.lineplot(profile)
l = len(profile)
ax = seaborn.lineplot(x=[100*i/(l-1) for i in range(l)], y=profile)
ax.set_xlabel("Position")
ax.set_xlim(0, 100)
ax.set_ylim(0)
......@@ -29,7 +30,7 @@ def plotProfile(profile, outputPath):
def sumProfiles(sameSizeProfiles):
return list(map(sum, zip(*sameSizeProfiles)))
def computeProfile(measures, resolution=100):
def computeProfile(measures, resolution):
bySize, count = {}, 0
for measure in measures:
distribution = gate(measure)
......@@ -39,13 +40,28 @@ def computeProfile(measures, resolution=100):
bySize[l] = []
bySize[l].append(distribution)
resampled = map(resample(resolution), map(sumProfiles, bySize.values()))
return [100*x/count for x in sumProfiles(list(resampled))]
return [resolution*x/count for x in sumProfiles(list(resampled))]
def visualiseProfile(measures, outputPath):
profile = computeProfile(measures)
def visualiseProfile(measures, outputPath, resolution=100):
profile = computeProfile(measures, resolution)
toTSV(prepare(f"{outputPath}/profile.tsv"), profile, sortBy=None)
plotProfile(profile, f"{outputPath}/profile.png")
plotDensity(profile, f"{outputPath}/profile.png")
def parseArgs(args):
positional = []
kwargs = {}
i = 0
while i < len(args):
if args[i] == '--resolution':
kwargs['resolution'] = int(args[i+1])
i += 1
else:
positional.append(args[i])
i += 1
return positional, kwargs
if __name__ == '__main__':
visualiseProfile([measure for _, measure in fromTSV(argv[1]).iterrows()],
argv[2])
args, kwargs = parseArgs(argv[1:])
visualiseProfile([measure for _, measure in fromTSV(args[0]).iterrows()],
args[1],
**kwargs)