Skip to content
Snippets Groups Projects
Commit 030d12c2 authored by Alice Brenon's avatar Alice Brenon
Browse files

Split visualization functions into a new module

parent 85986ba9
No related branches found
No related tags found
No related merge requests found
from EDdA.classification.nGramsFrequencies import topNGrams from EDdA.classification.nGramsFrequencies import topNGrams
from EDdA.classification.classSimilarities \ from EDdA.classification.classSimilarities \
import colinearity, confusionMatrix, keysIntersection, toPNG, metrics import colinearity, confusionMatrix, keysIntersection, metrics
from EDdA.classification.visualization import heatmap, histogram, showGraph
from EDdA import data from EDdA import data
import math import math
import matplotlib.pyplot as plot
import seaborn
def keysIntersection(d1, d2): def keysIntersection(d1, d2):
return len(set(d1).intersection(d2)) return len(set(d1).intersection(d2))
...@@ -32,12 +30,3 @@ def confusionMatrix(vectorizer, metric, domains=data.domains): ...@@ -32,12 +30,3 @@ def confusionMatrix(vectorizer, metric, domains=data.domains):
for b in range(0, matrixSize): for b in range(0, matrixSize):
m[a][b] = metric(vectorizer(domains[a]), vectorizer(domains[b])) m[a][b] = metric(vectorizer(domains[a]), vectorizer(domains[b]))
return m return m
def toPNG(matrix, filePath, domains=list(map(data.shortDomain, data.domains)), **kwargs):
    """Draw `matrix` as a seaborn heatmap and save it to `filePath`.

    `domains` labels both axes. Extra keyword arguments are forwarded to
    `seaborn.heatmap`; the colour map falls back to 'Blues' when the caller
    does not pass `cmap`.
    """
    plot.figure(figsize=(16, 13))
    # Only fills in the colour map when the caller left it unspecified.
    kwargs.setdefault('cmap', 'Blues')
    seaborn.heatmap(matrix, xticklabels=domains, yticklabels=domains, **kwargs)
    plot.savefig(filePath, dpi=300, bbox_inches='tight')
from EDdA import data
from EDdA.store import preparePath
import graphviz
from math import ceil
import matplotlib.cm as cm
import matplotlib
import matplotlib.pyplot as plot
import numpy
import seaborn
def colorize(vmin, vmax, cmap):
    """Return a function converting a scalar into a "#rrggbb" colour string.

    Scalars are normalised between `vmin` and `vmax` (values outside that
    interval are clipped) and looked up in the seaborn palette named `cmap`.
    """
    scale = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax, clip=True)
    palette = seaborn.color_palette(cmap, as_cmap=True)
    mapper = cm.ScalarMappable(norm=scale, cmap=palette)

    def toHexRGB(color):
        # The alpha channel returned by to_rgba is deliberately ignored.
        red, green, blue, _alpha = mapper.to_rgba(color)
        channels = (int(red * 255), int(green * 255), int(blue * 255))
        return "#" + "".join(f"{channel:02x}" for channel in channels)

    return toHexRGB
def showGraph(adjacency, filePath, cmap='Blues'):
    """Render an adjacency matrix as a left-to-right directed graph (PNG).

    Parameters:
        adjacency: sequence of rows; `adjacency[i][j]` is the weight of the
            edge from node i to node j, or None when there is no edge.
        filePath: output location, passed through `preparePath` before
            rendering.
        cmap: name of the seaborn palette used to colour edges by weight.

    Node i is labelled with the short name of `data.domains[i]`. Integer
    weights are printed as-is, any other weight with two decimals.

    Returns whatever `graphviz.Digraph.render` returns (the path of the
    rendered file according to the graphviz documentation).
    """
    edgeValues = [x for row in adjacency for x in row if x is not None]
    # A graph without any edge has nothing to colour; building the scale
    # unconditionally would make min()/max() raise on the empty list.
    color = (
        colorize(min(edgeValues), max(edgeValues), cmap)
        if edgeValues else None
    )
    g = graphviz.Digraph()
    g.graph_attr['rankdir'] = 'LR'
    for i in range(len(adjacency)):
        g.node(str(i), label=data.shortDomain(data.domains[i]))
    for i, row in enumerate(adjacency):
        for j, link in enumerate(row):
            if link is None:
                continue
            # Exact type check on purpose: only plain ints skip the
            # two-decimal formatting.
            label = f"{link}" if type(link) is int else f"{link:.2f}"
            g.edge(str(i), str(j), color=color(link), label=label)
    return g.render(preparePath(filePath), format='png')
def heatmap(matrix, filePath, domains=list(map(data.shortDomain, data.domains)), **kwargs):
    """Save a seaborn heatmap of `matrix` to `filePath`.

    Both axes are labelled with `domains`. Any extra keyword argument is
    handed to `seaborn.heatmap`; 'Blues' is used as the colour map unless
    the caller supplies its own `cmap`.
    """
    # Caller-supplied options win over the default colour map.
    options = {'cmap': 'Blues', **kwargs}
    plot.figure(figsize=(16, 13))
    seaborn.heatmap(
        matrix,
        xticklabels=domains,
        yticklabels=domains,
        **options
    )
    plot.savefig(filePath, dpi=300, bbox_inches='tight')
def topValue(delta, ticks=5):
    """Round a positive range up to a "nice" axis maximum.

    For `delta` between 3 and 50 the result is the smallest value reached
    by counting up from ceil(delta) that is a multiple of `ticks`. Smaller
    or larger ranges are first rescaled (by 10, or by 5 for ranges between
    50 and 100) into that interval, rounded there, and scaled back.

    Parameters:
        delta: the range to cover; must be finite and strictly positive.
        ticks: the result (in the 3..50 interval) is a multiple of this.

    Raises:
        ValueError: if `delta` is zero, negative, infinite or NaN. Such
            values can never be rescaled into the 3..50 interval and
            previously ended in unbounded recursion.
    """
    if not 0 < delta < float('inf'):
        raise ValueError(
            f"topValue expects a finite, positive delta, got {delta!r}"
        )
    if delta < 3:
        return topValue(10 * delta, ticks) / 10
    if delta > 100:
        return topValue(delta / 10, ticks) * 10
    if delta > 50:
        return topValue(delta / 5, ticks) * 5
    top = ceil(delta)
    # Count up to the next multiple of `ticks`.
    while top % ticks > 0:
        top += 1
    return top
def histogram(keys, values, filePath, minValue=0, maxValue=None):
    """Draw a bar chart of `values` labelled by `keys` and save it.

    Parameters:
        keys: bar labels, one per value.
        values: bar heights.
        filePath: where the figure is saved.
        minValue: lower bound of the y axis (default 0).
        maxValue: upper reference for the y axis; defaults to max(values).

    The y axis runs from `minValue` to slightly above a rounded maximum
    computed by `topValue`, with horizontal grid lines at each tick. Each
    bar is annotated with its height.

    Raises:
        ValueError: if the range `maxValue - minValue` is not strictly
            positive, since no axis scale can be derived from it.
    """
    maxValue = max(values) if maxValue is None else maxValue
    span = maxValue - minValue
    # Checked before any figure is created so nothing is left half-built.
    if not span > 0:
        raise ValueError(
            f"histogram needs maxValue > minValue, got {maxValue!r} and {minValue!r}"
        )
    fig, ax = plot.subplots(figsize=(18, 6))
    delta = topValue(span)
    # 10% head-room above the rounded range leaves space for bar labels.
    yTop = minValue + (1.1 * delta)
    # Five intervals, matching topValue's default `ticks`.
    yTicks = numpy.arange(minValue, yTop, delta / 5)
    ax.hlines(y=yTicks, xmin=-1, xmax=len(keys) - 1, color="#bfbfbf", lw=0.6)
    ax.set_xlim(-0.5, len(keys) - 0.5)
    # Start the axis at minValue so it agrees with the ticks and grid lines
    # (a hard-coded 0 hid everything below zero for negative minValue).
    ax.set_ylim(minValue, yTop)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)
    bars = ax.bar(keys, values, width=0.5)
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height + (delta / 30),
                '%.4g' % height, ha='center', va='bottom', fontsize=12, rotation=90)
    plot.xticks(fontsize=12, rotation=45, ha='right')
    plot.yticks(yTicks, fontsize=12)
    fig.savefig(filePath, bbox_inches='tight', dpi=150)
(use-modules ((gnu packages python-science) #:select (python-pandas)) (use-modules ((gnu packages python-science) #:select (python-pandas))
((gnu packages python-xyz) #:select (python-matplotlib ((gnu packages python-xyz) #:select (python-matplotlib
python-nltk python-nltk
python-numpy
python-seaborn)) python-seaborn))
((gnu packages graphviz) #:select (graphviz python-graphviz)) ((gnu packages graphviz) #:select (graphviz python-graphviz))
(guix gexp) (guix gexp)
...@@ -24,6 +25,7 @@ ...@@ -24,6 +25,7 @@
python-graphviz python-graphviz
python-matplotlib python-matplotlib
python-nltk python-nltk
python-numpy
python-pandas python-pandas
python-seaborn python-seaborn
)) ))
......
%% Cell type:markdown id:11511929 tags: %% Cell type:markdown id:11511929 tags:
# Confusion Matrices # Lexical similarities
We want to study the n-gram similarities between the domains: a visual way to achieve this is to represent these similarities as confusion matrices, which is the format we used to visualize the errors of our models and which will hence provide a basis for comparison.
We start by including the EDdA modules from the [project's gitlab](https://gitlab.liris.cnrs.fr/geode/EDdA-Classification). We start by including the EDdA modules from the [project's gitlab](https://gitlab.liris.cnrs.fr/geode/EDdA-Classification).
%% Cell type:code id:a5f3d434 tags: %% Cell type:code id:a5f3d434 tags:
``` /gnu/store/2rpsj69fzmcnafz4rml0blrynfayxqzr-python-wrapper-3.9.9/bin/python ``` /gnu/store/fby6l226w8kh2mwkzpjpajmgy0q1kxli-python-wrapper-3.9.9/bin/python
from EDdA import data from EDdA import data
from EDdA.store import preparePath from EDdA.store import preparePath
from EDdA.classification import confusionMatrix, metrics, toPNG, topNGrams from EDdA.classification import confusionMatrix, heatmap, metrics, topNGrams
import os import os
``` ```
%% Cell type:markdown id:4c3064ea tags: %% Cell type:markdown id:4c3064ea tags:
Then we load the training set into a new data structure called a `Source`, which contains a `pandas` `Dataframe` and a hash computed from the list of exact articles "coordinates" (volume and article number, and their order matters) contained in the original tsv file. Then we load the training set into a new data structure called a `Source`, which contains a `pandas` `Dataframe` and a hash computed from the list of exact articles "coordinates" (volume and article number, and their order matters) contained in the original tsv file.
%% Cell type:code id:5ad65685 tags: %% Cell type:code id:5ad65685 tags:
``` /gnu/store/2rpsj69fzmcnafz4rml0blrynfayxqzr-python-wrapper-3.9.9/bin/python ``` /gnu/store/fby6l226w8kh2mwkzpjpajmgy0q1kxli-python-wrapper-3.9.9/bin/python
source = data.load('training_set') source = data.load('training_set')
``` ```
%% Cell type:markdown id:4079559f tags: %% Cell type:markdown id:4079559f tags:
We loop on the n-gram size (`n`), the number of `ranks` to keep when computing the most frequent ones and the comparison method (the metrics' `name`). We loop on the n-gram size (`n`), the number of `ranks` to keep when computing the most frequent ones and the comparison method (the metrics' `name`), generating a PNG confusion matrix for each combination.
%% Cell type:code id:b39c5be0 tags: %% Cell type:code id:b39c5be0 tags:
``` /gnu/store/2rpsj69fzmcnafz4rml0blrynfayxqzr-python-wrapper-3.9.9/bin/python ``` /gnu/store/fby6l226w8kh2mwkzpjpajmgy0q1kxli-python-wrapper-3.9.9/bin/python
for n in range(1,4): for n in range(1,4):
for ranks in [10, 50, 100]: for ranks in [10, 50, 100]:
vectorizer = topNGrams(source, n, ranks) vectorizer = topNGrams(source, n, ranks)
for name in ['colinearity', 'keysIntersection']: for name in ['colinearity', 'keysIntersection']:
imagePath = preparePath(f"confusionMatrix/{source.hash}/{n}grams_top{ranks}_{name}.png") imagePath = preparePath(f"confusionMatrix/{source.hash}/{n}grams_top{ranks}_{name}.png")
toPNG(confusionMatrix(vectorizer, metrics[name]), imagePath) heatmap(confusionMatrix(vectorizer, metrics[name]), imagePath)
``` ```
......
...@@ -2,8 +2,7 @@ ...@@ -2,8 +2,7 @@
from EDdA import data from EDdA import data
from EDdA.store import preparePath from EDdA.store import preparePath
from EDdA.classification import confusionMatrix, metrics, toPNG, topNGrams from EDdA.classification import confusionMatrix, heatmap, metrics, topNGrams
import os
import sys import sys
def __syntax(this): def __syntax(this):
...@@ -25,7 +24,7 @@ def __compute(sourcePath, ns, ranksToTry, metricNames, root): ...@@ -25,7 +24,7 @@ def __compute(sourcePath, ns, ranksToTry, metricNames, root):
vectorizer = topNGrams(source, n, ranks) vectorizer = topNGrams(source, n, ranks)
for name in metricNames: for name in metricNames:
imagePath = preparePath(f"{path}/{n}grams_top{ranks}_{name}.png") imagePath = preparePath(f"{path}/{n}grams_top{ranks}_{name}.png")
toPNG(confusionMatrix(vectorizer, metrics[name]), imagePath) heatmap(confusionMatrix(vectorizer, metrics[name]), imagePath)
if __name__ == '__main__': if __name__ == '__main__':
argc = len(sys.argv) argc = len(sys.argv)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment