diff --git a/EDdA/classification/__init__.py b/EDdA/classification/__init__.py
index d04192abfac151935bdfdefe460731420518a29d..3d9c545d53ff0b15685c66bfd36bc9199838f589 100644
--- a/EDdA/classification/__init__.py
+++ b/EDdA/classification/__init__.py
@@ -1,3 +1,4 @@
 from EDdA.classification.nGramsFrequencies import topNGrams
 from EDdA.classification.classSimilarities \
-        import colinearity, confusionMatrix, keysIntersection, toPNG, metrics
+        import colinearity, confusionMatrix, keysIntersection, metrics
+from EDdA.classification.visualization import heatmap, histogram, showGraph
diff --git a/EDdA/classification/classSimilarities.py b/EDdA/classification/classSimilarities.py
index e36e3902361984fd061e46effdcb953924f40bb4..8d38171af5cef2f6d922451ed1e475b805acdf00 100644
--- a/EDdA/classification/classSimilarities.py
+++ b/EDdA/classification/classSimilarities.py
@@ -1,7 +1,5 @@
 from EDdA import data
 import math
-import matplotlib.pyplot as plot
-import seaborn
 
 def keysIntersection(d1, d2):
     return len(set(d1).intersection(d2))
@@ -32,12 +30,3 @@ def confusionMatrix(vectorizer, metric, domains=data.domains):
         for b in range(0, matrixSize):
             m[a][b] = metric(vectorizer(domains[a]), vectorizer(domains[b]))
     return m
-
-def toPNG(matrix, filePath, domains=list(map(data.shortDomain, data.domains)), **kwargs):
-    plot.figure(figsize=(16,13))
-    if 'cmap' not in kwargs:
-        kwargs['cmap'] = 'Blues'
-    ax = seaborn.heatmap(
-            matrix, xticklabels=domains, yticklabels=domains, **kwargs
-        )
-    plot.savefig(filePath, dpi=300, bbox_inches='tight')
diff --git a/EDdA/classification/visualization.py b/EDdA/classification/visualization.py
new file mode 100644
index 0000000000000000000000000000000000000000..e97429f0aa6cda1adc1162593f67bc462944014c
--- /dev/null
+++ b/EDdA/classification/visualization.py
@@ -0,0 +1,114 @@
+from EDdA import data
+from EDdA.store import preparePath
+import graphviz
+from math import ceil
+import matplotlib.cm as cm
+import matplotlib
+import matplotlib.pyplot as plot
+import numpy
+import seaborn
+
+def colorize(vmin, vmax, cmap):
+    """Return a function mapping a value in [vmin, vmax] to a '#rrggbb' color.
+
+    Values outside of the range are clipped to its bounds; `cmap` is handed to
+    seaborn.color_palette.
+    """
+    mapper = cm.ScalarMappable(
+            norm=matplotlib.colors.Normalize(vmin=vmin, vmax=vmax, clip=True),
+            cmap=seaborn.color_palette(cmap, as_cmap=True)
+        )
+    def toHexRGB(color):
+        (r, g, b, _) = mapper.to_rgba(color)
+        return f"#{int(r*255):02x}{int(g*255):02x}{int(b*255):02x}"
+    return toHexRGB
+
+def showGraph(adjacency, filePath, cmap='Blues'):
+    """Render an adjacency matrix as a directed graph saved in PNG format.
+
+    Node i is labelled with the short name of data.domains[i]; every
+    adjacency[i][j] which is not None becomes an edge from i to j, colored
+    after its value and labelled with it. Return what graphviz' render returns.
+    """
+    edgeValues = [x for row in adjacency for x in row if x is not None]
+    # the defaults keep min / max from failing on a graph without any edge
+    low, high = min(edgeValues, default=0), max(edgeValues, default=0)
+    color = colorize(low, high, cmap)
+    g = graphviz.Digraph()
+    g.graph_attr['rankdir'] = 'LR'
+    for i in range(len(adjacency)):
+        g.node(str(i), label=data.shortDomain(data.domains[i]))
+    for i, row in enumerate(adjacency):
+        for j, link in enumerate(row):
+            if link is not None:
+                label = f"{link}" if isinstance(link, int) else f"{link:.2f}"
+                g.edge(str(i), str(j), color=color(link), label=label)
+    return g.render(preparePath(filePath), format='png')
+
+def heatmap(matrix, filePath, domains=list(map(data.shortDomain, data.domains)), **kwargs):
+    """Save `matrix` as a PNG heatmap using `domains` as labels on both axes.
+
+    Extra keyword arguments are handed to seaborn.heatmap, `cmap` defaulting to
+    'Blues'.
+    """
+    figure = plot.figure(figsize=(16,13))
+    kwargs.setdefault('cmap', 'Blues')
+    seaborn.heatmap(
+            matrix, xticklabels=domains, yticklabels=domains, **kwargs
+        )
+    plot.savefig(filePath, dpi=300, bbox_inches='tight')
+    # callers draw many heatmaps in a loop: do not leave the figure open
+    plot.close(figure)
+
+def topValue(delta, ticks=5):
+    """Return a round upper bound for an axis spanning a range of `delta`.
+
+    The range is scaled into [3, 50] and rounded up to a multiple of `ticks`.
+    Raise ValueError when `delta` is not positive.
+    """
+    if delta <= 0:
+        # scaling a non-positive range by 10 would recurse forever
+        raise ValueError(f"expected a positive range, got {delta}")
+    elif delta < 3:
+        return topValue(10*delta, ticks)/10
+    elif delta > 100:
+        return topValue(delta/10, ticks)*10
+    elif delta > 50:
+        return topValue(delta/5, ticks)*5
+    else:
+        delta = ceil(delta)
+        # smallest multiple of ticks which is not below delta
+        return delta + (-delta % ticks)
+
+def histogram(keys, values, filePath, minValue=0, maxValue=None):
+    """Save a bar chart of `values` labelled by `keys` in PNG format.
+
+    The y-axis goes from `minValue` to a bit above `maxValue` (the greatest of
+    `values` unless given) and each bar is annotated with its height.
+    """
+    fig, ax = plot.subplots(figsize = (18, 6))
+    maxValue = max(values) if maxValue is None else maxValue
+    # a flat series has no range to round up: use a unit range instead
+    delta = topValue(maxValue - minValue) if maxValue > minValue else 1
+    yTop = minValue + (1.1*delta)
+    yTicks = numpy.arange(minValue, yTop, delta / 5)
+    # the grid lines must reach the right limit of the x-axis set below
+    ax.hlines(y=yTicks, xmin=-1, xmax=len(keys), color="#bfbfbf", lw=0.6)
+
+    ax.set_xlim(-0.5, len(keys) - 0.5)
+    # honor minValue instead of always starting the axis at 0
+    ax.set_ylim(minValue, yTop)
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+    ax.spines['left'].set_visible(False)
+    bars = ax.bar(keys, values, width=0.5)
+    for bar in bars :
+        height = bar.get_height()
+        ax.text(bar.get_x() + bar.get_width()/2., height + (delta/30),
+                '%.4g' % height, ha='center', va='bottom', fontsize=12, rotation=90)
+
+    plot.xticks(fontsize=12, rotation=45, ha='right')
+    plot.yticks(yTicks, fontsize=12)
+    fig.savefig(filePath, bbox_inches = 'tight', dpi=150)
+    # do not leave the figure open once it has been saved
+    plot.close(fig)
diff --git a/guix.scm b/guix.scm
index ae0c88c27bb553b3ffe9bee6f97fb060060c91de..2c6c71a5219f77c8a693377d388dcd2eb11abf32 100644
--- a/guix.scm
+++ b/guix.scm
@@ -1,6 +1,7 @@
 (use-modules ((gnu packages python-science) #:select (python-pandas))
              ((gnu packages python-xyz) #:select (python-matplotlib
                                                   python-nltk
+                                                  python-numpy
                                                   python-seaborn))
              ((gnu packages graphviz) #:select (graphviz python-graphviz))
              (guix gexp)
@@ -24,6 +25,7 @@
                      python-graphviz
                      python-matplotlib
                      python-nltk
+                     python-numpy
                      python-pandas
                      python-seaborn
                      ))
diff --git a/notebooks/Confusion_Matrices.ipynb b/notebooks/Lexical_similarities.ipynb
similarity index 77%
rename from notebooks/Confusion_Matrices.ipynb
rename to notebooks/Lexical_similarities.ipynb
index 38cb20a4d6e1070e2c64cbdd9217e451fcc9e652..a7a6ae969d1a53e110398ee17e47b35f8ff6f97c 100644
--- a/notebooks/Confusion_Matrices.ipynb
+++ b/notebooks/Lexical_similarities.ipynb
@@ -5,7 +5,9 @@
    "id": "11511929",
    "metadata": {},
    "source": [
-    "# Confusion Matrices\n",
+    "# Lexical similarities\n",
+    "\n",
+    "We want to study the n-gram similarities between the domains: a visual way to achieve this is to represent these similarities by confusion matrices, which is the format we used to visualize the errors of our models and will hence provide a base for comparison.\n",
     "\n",
     "We start by including the EDdA modules from the [project's gitlab](https://gitlab.liris.cnrs.fr/geode/EDdA-Classification)."
    ]
@@ -19,7 +21,7 @@
    "source": [
     "from EDdA import data\n",
     "from EDdA.store import preparePath\n",
-    "from EDdA.classification import confusionMatrix, metrics, toPNG, topNGrams\n",
+    "from EDdA.classification import confusionMatrix, heatmap, metrics, topNGrams\n",
     "import os"
    ]
   },
@@ -46,7 +48,7 @@
    "id": "4079559f",
    "metadata": {},
    "source": [
-    "We loop on the n-gram size (`n`), the number of `ranks` to keep when computing the most frequent ones and the comparison method (the metrics' `name`)."
+    "We loop on the n-gram size (`n`), the number of `ranks` to keep when computing the most frequent ones and the comparison method (the metrics' `name`), generating a PNG confusion matrix for each combination."
    ]
   },
   {
@@ -61,14 +63,14 @@
     "        vectorizer = topNGrams(source, n, ranks)\n",
     "        for name in ['colinearity', 'keysIntersection']:\n",
     "            imagePath = preparePath(f\"confusionMatrix/{source.hash}/{n}grams_top{ranks}_{name}.png\")\n",
-    "            toPNG(confusionMatrix(vectorizer, metrics[name]), imagePath)"
+    "            heatmap(confusionMatrix(vectorizer, metrics[name]), imagePath)"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
    "display_name": "Python 3",
-   "language": "/gnu/store/2rpsj69fzmcnafz4rml0blrynfayxqzr-python-wrapper-3.9.9/bin/python",
+   "language": "/gnu/store/fby6l226w8kh2mwkzpjpajmgy0q1kxli-python-wrapper-3.9.9/bin/python",
    "name": "python3"
   },
   "language_info": {
diff --git a/scripts/confusionMatrices.py b/scripts/lexicalSimilarities.py
similarity index 88%
rename from scripts/confusionMatrices.py
rename to scripts/lexicalSimilarities.py
index 2e07dd237f04f87c27072e6b80bfe1a67bc9ed01..6081540f938ee8edd454167a01e8d547316b78ee 100644
--- a/scripts/confusionMatrices.py
+++ b/scripts/lexicalSimilarities.py
@@ -2,8 +2,7 @@
 
 from EDdA import data
 from EDdA.store import preparePath
-from EDdA.classification import confusionMatrix, metrics, toPNG, topNGrams
-import os
+from EDdA.classification import confusionMatrix, heatmap, metrics, topNGrams
 import sys
 
 def __syntax(this):
@@ -25,7 +24,7 @@ def __compute(sourcePath, ns, ranksToTry, metricNames, root):
             vectorizer = topNGrams(source, n, ranks)
             for name in metricNames:
                 imagePath = preparePath(f"{path}/{n}grams_top{ranks}_{name}.png")
-                toPNG(confusionMatrix(vectorizer, metrics[name]), imagePath)
+                heatmap(confusionMatrix(vectorizer, metrics[name]), imagePath)
 
 if __name__ == '__main__':
     argc = len(sys.argv)