diff --git a/GEODE/Visualisation/LexicalSimilarities.py b/GEODE/Visualisation/LexicalSimilarities.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed7ba848fdfc016bc0821144a779e6c6542203bc
--- /dev/null
+++ b/GEODE/Visualisation/LexicalSimilarities.py
@@ -0,0 +1,93 @@
+import argparse
+from GEODE.Classification.NGrams import loadFrequencies
+from GEODE.Store import JSON
+from GEODE.Visualisation.Label import add_labels_argument, getLabels
+import math
+
+def keysIntersection(d1, d2):
+    """Return the share of the keys of d1 that also appear in d2.
+
+    The measure is asymmetric: it is normalised by the size of d1 only.
+    An empty d1 yields 0.0 rather than a division by zero.
+    """
+    if not d1:
+        return 0.0
+    return len(set(d1).intersection(d2)) / len(d1)
+
+def scalarProduct(d1, d2):
+    """Return the dot product of two sparse vectors stored as dicts.
+
+    Only the keys present in both d1 and d2 contribute to the sum.
+    """
+    return sum(d1[k] * d2[k] for k in set(d1).intersection(d2))
+
+def norm(d):
+    """Return the Euclidean norm of the sparse vector d."""
+    return math.sqrt(scalarProduct(d, d))
+
+def colinearity(d1, d2):
+    """Return the cosine of the angle between the sparse vectors d1 and d2.
+
+    The angle is undefined when either vector has a null norm, so 0.0 is
+    returned in that case instead of raising a ZeroDivisionError.
+    """
+    denominator = norm(d1) * norm(d2)
+    if denominator == 0:
+        return 0.0
+    return scalarProduct(d1, d2) / denominator
+
+# Metrics selectable by name from the command line.
+metrics = {f.__name__: f for f in [colinearity, keysIntersection]}
+
+def getLexicalSimilarities(vectors, metric):
+    """Return the square matrix of metric(a, b) for each pair of vectors.
+
+    Every ordered pair is evaluated because a metric may be asymmetric.
+    """
+    return [[metric(a, b) for b in vectors] for a in vectors]
+
+def extractLexicalSimilarities(inputDir, outputJSON, labels, metric, top=None):
+    """Compute the similarity matrix between classes and save it as JSON.
+
+    The frequencies of each label are loaded from "<inputDir>/<label>.tsv"
+    and compared pairwise using the function registered in metrics under
+    the name given by metric.
+    """
+    vectors = [loadFrequencies(f"{inputDir}/{domain}.tsv", top=top)
+               for domain in labels]
+    matrix = getLexicalSimilarities(vectors, metrics[metric])
+    JSON.save({'matrix': matrix, 'labels': labels}, outputJSON)
+
+def getArgs(arguments):
+    """Parse the command-line arguments of the command.
+
+    The metric is restricted to the names registered in metrics and falls
+    back to 'colinearity' when the option is omitted.
+    """
+    description = "Extract a similarity matrix from n-grams features"
+    # NOTE(review): prog differs from the 'lexicalSimilarities' key this
+    # command is registered under in GEODE/__init__.py; confirm the name.
+    cli = argparse.ArgumentParser(prog='similarityMatrix',
+                                  description=description)
+    cli.add_argument('inputDir', help="path containing the n-grams for each class")
+    cli.add_argument('outputJSON')
+    add_labels_argument(cli)
+    cli.add_argument(
+        '-m', '--metric',
+        choices=list(metrics),
+        default='colinearity',
+        help="metric used to compare the classes (one of: {names})".format(
+            names=', '.join(metrics)))
+    cli.add_argument('-t', '--top', type=int,
+                     help="number of top elements compared from each class")
+    return cli.parse_args(arguments)
+
+def extractLexicalSimilaritiesCLI(arguments):
+    """Run the command: parse arguments then write the similarity matrix."""
+    args = getArgs(arguments)
+    labels = getLabels(args)
+    extractLexicalSimilarities(args.inputDir,
+                               args.outputJSON,
+                               labels,
+                               args.metric,
+                               top=args.top)
diff --git a/GEODE/Visualisation/__init__.py b/GEODE/Visualisation/__init__.py
index 3cc602b36c656a14eadf6cdd1efdccd12b4e90bb..7b23498ce2752eea41649da4d1fdee473314582d 100644
--- a/GEODE/Visualisation/__init__.py
+++ b/GEODE/Visualisation/__init__.py
@@ -5,3 +5,4 @@ from GEODE.Visualisation.DensityProfile import densityProfile, \
 from GEODE.Visualisation.DrawMatrix import drawMatrix
 from GEODE.Visualisation.Graph import drawGraph
 from GEODE.Visualisation.Legend import trim as legend
+from GEODE.Visualisation.LexicalSimilarities import extractLexicalSimilarities
diff --git a/GEODE/__init__.py b/GEODE/__init__.py
index a3fd4123c92aef7df0134d17b3f8fca292077863..53acc070e2cf37a3069cec57427652ce34c51670 100644
--- a/GEODE/__init__.py
+++ b/GEODE/__init__.py
@@ -29,12 +29,14 @@ from GEODE.Visualisation.ConfusionMatrix import extractConfusionMatrixCLI
 from GEODE.Visualisation.DensityProfile import drawDensityProfileCLI
 from GEODE.Visualisation.DrawMatrix import drawMatrixCLI
 from GEODE.Visualisation.Graph import drawGraphCLI
+from GEODE.Visualisation.LexicalSimilarities import extractLexicalSimilaritiesCLI
 
 commands = {
     'confusionMatrix': extractConfusionMatrixCLI,
     'densityProfile': drawDensityProfileCLI,
     'drawMatrix': drawMatrixCLI,
-    'graph': drawGraphCLI
+    'graph': drawGraphCLI,
+    'lexicalSimilarities': extractLexicalSimilaritiesCLI
 }
 
 def geopyckCLI():