Skip to content
Snippets Groups Projects
Commit 226437a9 authored by Alice Brenon's avatar Alice Brenon
Browse files

Add a script to compute lexical similarities

parent 85c082af
No related branches found
No related tags found
No related merge requests found
import argparse
from GEODE.Classification.NGrams import loadFrequencies
from GEODE.Store import JSON
from GEODE.Visualisation.Label import add_labels_argument, getLabels
import math
def keysIntersection(d1, d2):
    """Return the fraction of the keys of d1 that are also keys of d2.

    The measure is not symmetric: the size of the intersection is divided
    by the number of keys of d1 only.

    Args:
        d1: reference mapping whose keys are looked up in d2.
        d2: mapping (or any iterable of keys) compared against d1.

    Returns:
        The shared-keys ratio as a float. An empty d1 yields 0.0 instead
        of raising ZeroDivisionError, since it shares nothing with d2.
    """
    size = len(d1)
    if size == 0:
        # 0 / 0 is undefined; by convention an empty class overlaps nothing.
        return 0.0
    return len(set(d1).intersection(d2)) / size
def scalarProduct(d1, d2):
    """Dot product of two sparse vectors stored as mappings.

    Only the keys found in both d1 and d2 contribute to the result; a key
    missing from either side behaves like a zero component. With no key in
    common the result is the integer 0.
    """
    shared = set(d1.keys()).intersection(d2)
    return sum(d1[key] * d2[key] for key in shared)
def norm(d):
    """Euclidean length of a sparse vector stored as a mapping."""
    squaredLength = scalarProduct(d, d)
    return math.sqrt(squaredLength)
def colinearity(d1, d2):
    """Cosine of the angle between two sparse vectors stored as mappings.

    Computed as the scalar product of d1 and d2 divided by the product of
    their norms.

    Args:
        d1: first vector, a mapping from keys to numeric components.
        d2: second vector, same representation.

    Returns:
        The cosine similarity as a float. When either vector has a zero
        norm (empty mapping or only zero components) the angle is
        undefined; 0.0 is returned instead of raising ZeroDivisionError.
    """
    scale = norm(d1) * norm(d2)
    if scale == 0:
        # A null vector has no direction: treat it as similar to nothing.
        return 0.0
    return scalarProduct(d1, d2) / scale
# Registry of the available similarity metrics, keyed by the name of the
# function implementing each of them.
metrics = dict((f.__name__, f) for f in (colinearity, keysIntersection))
def getLexicalSimilarities(vectors, metric):
    """Build the square matrix of pairwise similarities between vectors.

    The cell at row a, column b holds metric(vectors[a], vectors[b]). Every
    ordered pair is evaluated, diagonal included, so the metric does not
    need to be symmetric.

    Args:
        vectors: sequence of the items to compare.
        metric: callable taking two items and returning their similarity.

    Returns:
        A list of rows (lists), one row per item of vectors.
    """
    return [[metric(rowItem, columnItem) for columnItem in vectors]
            for rowItem in vectors]
def extractLexicalSimilarities(inputDir, outputJSON, labels, metric, top=None):
    """Compute the similarity matrix between classes and save it as JSON.

    For each label, frequencies are loaded from "<inputDir>/<label>.tsv";
    every pair of classes is then compared with the requested metric.

    Args:
        inputDir: directory holding one .tsv file per label.
        outputJSON: destination handed to JSON.save.
        labels: names of the classes, in the order used for the matrix
            rows and columns.
        metric: key of the comparison function in the metrics registry.
        top: forwarded to loadFrequencies (presumably limits the number
            of n-grams kept per class -- confirm against that helper).

    Raises:
        KeyError: if metric is not a key of the metrics registry.
    """
    vectors = []
    for domain in labels:
        vectors.append(loadFrequencies(f"{inputDir}/{domain}.tsv", top=top))
    compare = metrics[metric]
    similarities = getLexicalSimilarities(vectors, compare)
    JSON.save({'matrix': similarities, 'labels': labels}, outputJSON)
def getArgs(arguments):
    """Parse the command-line arguments of the similarity matrix extractor.

    Args:
        arguments: command-line tokens, handed as-is to
            ArgumentParser.parse_args.

    Returns:
        The argparse.Namespace holding inputDir, outputJSON, metric and
        top, plus whatever add_labels_argument registers on the parser.

    Raises:
        SystemExit: raised by argparse on invalid input, including a
            missing or unknown metric name.
    """
    description = "Extract a similarity matrix from n-grams features"
    cli = argparse.ArgumentParser(prog='similarityMatrix',
                                  description=description)
    cli.add_argument('inputDir',
                     help="path containing the n-grams for each class")
    cli.add_argument('outputJSON')
    add_labels_argument(cli)
    cli.add_argument(
        '-m', '--metric',
        # The value is later used as a key into `metrics`: validate it here
        # so a missing or misspelled name gives a usage error rather than a
        # bare KeyError once the input files have already been loaded.
        choices=list(metrics),
        required=True,
        help="metric used to compare the classes (one of: {names})".format(
            names=", ".join(metrics)))
    cli.add_argument('-t', '--top', type=int,
                     help="number of top elements compared from each class")
    return cli.parse_args(arguments)
def extractLexicalSimilaritiesCLI(arguments):
    """Command-line entry point: parse arguments, then run the extraction.

    Args:
        arguments: command-line tokens, forwarded to getArgs.
    """
    options = getArgs(arguments)
    classLabels = getLabels(options)
    extractLexicalSimilarities(inputDir=options.inputDir,
                               outputJSON=options.outputJSON,
                               labels=classLabels,
                               metric=options.metric,
                               top=options.top)
...@@ -5,3 +5,4 @@ from GEODE.Visualisation.DensityProfile import densityProfile, \ ...@@ -5,3 +5,4 @@ from GEODE.Visualisation.DensityProfile import densityProfile, \
from GEODE.Visualisation.DrawMatrix import drawMatrix from GEODE.Visualisation.DrawMatrix import drawMatrix
from GEODE.Visualisation.Graph import drawGraph from GEODE.Visualisation.Graph import drawGraph
from GEODE.Visualisation.Legend import trim as legend from GEODE.Visualisation.Legend import trim as legend
from GEODE.Visualisation.LexicalSimilarities import extractLexicalSimilarities
...@@ -29,12 +29,14 @@ from GEODE.Visualisation.ConfusionMatrix import extractConfusionMatrixCLI ...@@ -29,12 +29,14 @@ from GEODE.Visualisation.ConfusionMatrix import extractConfusionMatrixCLI
from GEODE.Visualisation.DensityProfile import drawDensityProfileCLI from GEODE.Visualisation.DensityProfile import drawDensityProfileCLI
from GEODE.Visualisation.DrawMatrix import drawMatrixCLI from GEODE.Visualisation.DrawMatrix import drawMatrixCLI
from GEODE.Visualisation.Graph import drawGraphCLI from GEODE.Visualisation.Graph import drawGraphCLI
from GEODE.Visualisation.LexicalSimilarities import extractLexicalSimilaritiesCLI
commands = { commands = {
'confusionMatrix': extractConfusionMatrixCLI, 'confusionMatrix': extractConfusionMatrixCLI,
'densityProfile': drawDensityProfileCLI, 'densityProfile': drawDensityProfileCLI,
'drawMatrix': drawMatrixCLI, 'drawMatrix': drawMatrixCLI,
'graph': drawGraphCLI 'graph': drawGraphCLI,
'lexicalSimilarities': extractLexicalSimilaritiesCLI
} }
def geopyckCLI(): def geopyckCLI():
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment