Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • abrenon/geopyck
1 result
Show changes
Commits on Source (3)
......@@ -2,6 +2,7 @@ from GEODE.Metadata import articleKey, toKey
from GEODE.Classification.Stopwords import isStopWord
from GEODE.Store import tabular
import nltk
import pandas
from string import punctuation
# Module-level table with two named slots, both initialised to None.
# NOTE(review): presumably filled in elsewhere in this module before use
# (the assignment is outside this hunk) -- confirm.
tools = {'lemmatize': None, 'spacyFr': None}
......
......@@ -57,8 +57,8 @@ def getArgs(arguments):
cli.add_argument('inputFile')
cli.add_argument('outputJSON')
add_labels_argument(cli)
cli.add_argument('-m', '--maximal',
action='store_const', const=True, default=False)
cli.add_argument('-m', '--maximal', action='store_const', const=True,
default=False, help="Simplify the matrix into an MCM")
return cli.parse_args(arguments)
def extractConfusionMatrixCLI(arguments):
......
import argparse
from GEODE.Classification.NGrams import loadFrequencies
from GEODE.Store import JSON
from GEODE.Visualisation.Label import add_labels_argument, getLabels
import math
def keysIntersection(d1, d2):
    """Return the fraction of the keys of d1 that also appear in d2.

    The ratio is taken relative to d1 only, so swapping the arguments may
    give a different result. Raises ZeroDivisionError when d1 is empty.
    """
    shared = set(d1).intersection(d2)
    total = len(d1)
    return len(shared) / total
def scalarProduct(d1, d2):
    """Return the dot product of two sparse vectors stored as mappings.

    Only the keys present in both d1 and d2 contribute; a key missing from
    either side counts as a zero component. With no shared key the result
    is 0.
    """
    common = set(d1.keys()).intersection(d2)
    return sum(d1[key] * d2[key] for key in common)
def norm(d):
    """Return the Euclidean length of the sparse vector d.

    Computed as the square root of the scalar product of d with itself.
    """
    squaredLength = scalarProduct(d, d)
    return math.sqrt(squaredLength)
def colinearity(d1, d2):
    """Return the scalar product of d1 and d2 divided by both their norms.

    Raises ZeroDivisionError when the product of the two norms is zero.
    """
    dotProduct = scalarProduct(d1, d2)
    lengths = norm(d1) * norm(d2)
    return dotProduct / lengths
# Comparison functions selectable by name, keyed by their own __name__.
metrics = dict((function.__name__, function)
               for function in (colinearity, keysIntersection))
def getLexicalSimilarities(vectors, metric):
    """Build the square matrix of pairwise comparisons between vectors.

    Cell [a][b] holds metric(vectors[a], vectors[b]). Every ordered pair is
    evaluated, the diagonal included, so metric need not be symmetric.
    Returns a list of rows; an empty input yields an empty list.
    """
    return [[metric(rowVector, columnVector) for columnVector in vectors]
            for rowVector in vectors]
def extractLexicalSimilarities(inputDir, outputJSON, labels, metric, top=None):
    """Compute the similarity matrix between classes and save it as JSON.

    For each entry of labels, frequencies are loaded from
    "<inputDir>/<label>.tsv" (top is forwarded to loadFrequencies). They are
    then compared pairwise with the function registered under the name
    metric in the module-level metrics table. The saved object has two keys:
    'matrix' and 'labels'.

    Raises KeyError when metric is not a key of metrics.
    """
    frequencies = []
    for domain in labels:
        frequencies.append(
            loadFrequencies(f"{inputDir}/{domain}.tsv", top=top))
    # The metric is resolved only after the files are loaded, as before.
    compare = metrics[metric]
    payload = {'matrix': getLexicalSimilarities(frequencies, compare),
               'labels': labels}
    JSON.save(payload, outputJSON)
def getArgs(arguments):
    """Parse the command-line arguments of the similarityMatrix command.

    arguments is the list of argument strings to parse. Returns the
    argparse namespace with inputDir, outputJSON, metric and top, plus
    whatever add_labels_argument registers.

    --metric is validated against the names in the module-level metrics
    table and is required. Without this, a missing or misspelt metric only
    failed later with a bare KeyError when the metric was looked up.
    """
    description = "Extract a similarity matrix from n-grams features"
    cli = argparse.ArgumentParser(prog='similarityMatrix',
                                  description=description)
    cli.add_argument('inputDir',
                     help="path containing the n-grams for each class")
    cli.add_argument('outputJSON')
    add_labels_argument(cli)
    # Join the names explicitly: formatting metrics.keys() directly printed
    # "dict_keys([...])" in the help message.
    names = ", ".join(metrics)
    cli.add_argument(
        '-m', '--metric', choices=list(metrics), required=True,
        metavar='METRIC',
        help=f"metric used to compare the classes (one of: {names})")
    cli.add_argument('-t', '--top', type=int,
                     help="number of top elements compared from each class")
    return cli.parse_args(arguments)
def extractLexicalSimilaritiesCLI(arguments):
    """Command-line entry point: parse arguments, then run the extraction.

    arguments is the list of argument strings handed to getArgs; the labels
    are obtained from the parsed namespace through getLabels.
    """
    parsed = getArgs(arguments)
    classLabels = getLabels(parsed)
    extractLexicalSimilarities(
        parsed.inputDir,
        parsed.outputJSON,
        classLabels,
        parsed.metric,
        top=parsed.top,
    )
......@@ -5,3 +5,4 @@ from GEODE.Visualisation.DensityProfile import densityProfile, \
from GEODE.Visualisation.DrawMatrix import drawMatrix
from GEODE.Visualisation.Graph import drawGraph
from GEODE.Visualisation.Legend import trim as legend
from GEODE.Visualisation.LexicalSimilarities import extractLexicalSimilarities
......@@ -29,12 +29,14 @@ from GEODE.Visualisation.ConfusionMatrix import extractConfusionMatrixCLI
from GEODE.Visualisation.DensityProfile import drawDensityProfileCLI
from GEODE.Visualisation.DrawMatrix import drawMatrixCLI
from GEODE.Visualisation.Graph import drawGraphCLI
from GEODE.Visualisation.LexicalSimilarities import extractLexicalSimilaritiesCLI
commands = {
'confusionMatrix': extractConfusionMatrixCLI,
'densityProfile': drawDensityProfileCLI,
'drawMatrix': drawMatrixCLI,
'graph': drawGraphCLI
'graph': drawGraphCLI,
'lexicalSimilarities': extractLexicalSimilaritiesCLI
}
def geopyckCLI():
......