Split visualization functions into a new module

030d12c2 · Alice Brenon · 85986ba9 · 030d12c2 · 030d12c2 · 030d12c2
Commit 030d12c2 authored 3 years ago by Alice Brenon
--- a/EDdA/classification/__init__.py
+++ b/EDdA/classification/__init__.py
 from EDdA.classification.nGramsFrequencies import topNGrams
 from EDdA.classification.classSimilarities \
-        import colinearity, confusionMatrix, keysIntersection, toPNG, metrics
+        import colinearity, confusionMatrix, keysIntersection, metrics
+from EDdA.classification.visualization import heatmap, histogram, showGraph
--- a/EDdA/classification/classSimilarities.py
+++ b/EDdA/classification/classSimilarities.py
 from EDdA import data
 import math
-import matplotlib.pyplot as plot
-import seaborn
 def keysIntersection(d1, d2):
    return len(set(d1).intersection(d2))
@@ -32,12 +30,3 @@ def confusionMatrix(vectorizer, metric, domains=data.domains):
        for b in range(0, matrixSize):
            m[a][b] = metric(vectorizer(domains[a]), vectorizer(domains[b]))
    return m
-def toPNG(matrix, filePath, domains=list(map(data.shortDomain, data.domains)), **kwargs):
-    plot.figure(figsize=(16,13))
-    if 'cmap' not in kwargs:
-        kwargs['cmap'] = 'Blues'
-    ax = seaborn.heatmap(
-            matrix, xticklabels=domains, yticklabels=domains, **kwargs
-        )
-    plot.savefig(filePath, dpi=300, bbox_inches='tight')
--- a/EDdA/classification/visualization.py
+++ b/EDdA/classification/visualization.py
+from EDdA import data
+from EDdA.store import preparePath
+import graphviz
+from math import ceil
+import matplotlib.cm as cm
+import matplotlib
+import matplotlib.pyplot as plot
+import numpy
+import seaborn
+def colorize(vmin, vmax, cmap):
+    """Build a function turning scalar values into '#rrggbb' colour strings.
+
+    Values are normalised between vmin and vmax (out-of-range values are
+    clipped) and looked up in the seaborn palette named by cmap.
+    """
+    scale = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax, clip=True)
+    palette = seaborn.color_palette(cmap, as_cmap=True)
+    mapper = cm.ScalarMappable(norm=scale, cmap=palette)
+    def toHexRGB(color):
+        # The fourth (alpha) component is not part of the hex notation.
+        red, green, blue, _ = mapper.to_rgba(color)
+        digits = "".join(f"{int(c*255):02x}" for c in (red, green, blue))
+        return f"#{digits}"
+    return toHexRGB
+def showGraph(adjacency, filePath, cmap='Blues'):
+    """Render an adjacency matrix as a left-to-right directed graph in PNG.
+
+    adjacency is a sequence of rows; adjacency[i][j] is either None (no edge)
+    or the weight of the edge from node i to node j. Node i is labelled with
+    the short name of data.domains[i]. Edges are coloured according to their
+    weight using the cmap palette and labelled with the weight (as is for
+    integers, with two decimals otherwise).
+
+    Returns whatever graphviz's render() returns for the file prepared from
+    filePath (the path of the rendered file according to the graphviz docs).
+    """
+    edgeValues = [x for row in adjacency for x in row if x is not None]
+    # min() and max() raise ValueError on an empty sequence; a graph without
+    # any edge needs no colour scale because the mapper is never called.
+    color = (colorize(min(edgeValues), max(edgeValues), cmap)
+             if edgeValues else None)
+    g = graphviz.Digraph()
+    g.graph_attr['rankdir'] = 'LR'
+    # Declare every node first so that isolated ones still get drawn.
+    for i in range(len(adjacency)):
+        g.node(str(i), label=data.shortDomain(data.domains[i]))
+    for i, row in enumerate(adjacency):
+        for j, link in enumerate(row):
+            if link is None:
+                continue
+            label = f"{link}" if isinstance(link, int) else f"{link:.2f}"
+            g.edge(str(i), str(j), color=color(link), label=label)
+    return g.render(preparePath(filePath), format='png')
+def heatmap(matrix, filePath, domains=list(map(data.shortDomain, data.domains)), **kwargs):
+    """Draw matrix as a seaborn heatmap and save it to filePath.
+
+    domains provides the tick labels of both axes; its default (the short
+    names of data.domains) is computed once, when the module is imported.
+    Extra keyword arguments are forwarded to seaborn.heatmap, 'cmap'
+    defaulting to 'Blues'.
+    """
+    fig = plot.figure(figsize=(16,13))
+    try:
+        kwargs.setdefault('cmap', 'Blues')
+        seaborn.heatmap(
+                matrix, xticklabels=domains, yticklabels=domains, **kwargs
+            )
+        fig.savefig(filePath, dpi=300, bbox_inches='tight')
+    finally:
+        # Figures created through pyplot stay alive until closed explicitly;
+        # without this they accumulate when heatmaps are generated in a loop.
+        plot.close(fig)
+def topValue(delta, ticks=5):
+    """Round delta up to a value that splits evenly into ticks steps.
+
+    For 3 <= delta <= 50 the result is the smallest multiple of ticks which
+    is not below delta. Values outside of that range are rescaled into it
+    (by factors of 10, or 5 for values between 50 and 100), rounded, then
+    scaled back.
+
+    Raises ValueError when delta is not a positive finite number: rescaling
+    zero, a negative number or infinity never reaches the 3..50 range, which
+    used to end in a RecursionError.
+    """
+    if not 0 < delta < float('inf'):
+        raise ValueError(f"delta must be positive and finite, got {delta!r}")
+    if delta < 3:
+        return topValue(10*delta, ticks)/10
+    elif delta > 100:
+        return topValue(delta/10, ticks)*10
+    elif delta > 50:
+        return topValue(delta/5, ticks)*5
+    else:
+        top = ceil(delta)
+        while top % ticks > 0:
+            top += 1
+        return top
+def histogram(keys, values, filePath, minValue=0, maxValue=None):
+    """Draw a bar chart of values labelled by keys and save it to filePath.
+
+    The vertical axis runs from minValue to slightly above a rounded-up
+    maximum (maxValue, or the largest of values when maxValue is None) and
+    is divided by light horizontal lines. Each bar is annotated with its
+    height.
+    """
+    fig, ax = plot.subplots(figsize = (18, 6))
+    try:
+        if maxValue is None:
+            # An empty series has no maximum: fall back to the axis origin.
+            maxValue = max(values, default=minValue)
+        span = maxValue - minValue
+        # topValue is only defined for a positive span; use a unit range for
+        # flat or empty data instead of failing.
+        delta = topValue(span) if span > 0 else 1
+        yTop = minValue + (1.1*delta)
+        yTicks = numpy.arange(minValue, yTop, delta / 5)
+        # Draw the lines past both x limits so that they span the whole plot.
+        ax.hlines(y=yTicks, xmin=-1, xmax=len(keys), color="#bfbfbf", lw=0.6)
+        ax.set_xlim(-0.5, len(keys) - 0.5)
+        # Start the axis at minValue, like the ticks, rather than always at 0.
+        ax.set_ylim(minValue, yTop)
+        for side in ('top', 'right', 'left'):
+            ax.spines[side].set_visible(False)
+        bars = ax.bar(keys, values, width=0.5)
+        for bar in bars:
+            height = bar.get_height()
+            # Vertical label centred on the bar, just above its top.
+            ax.text(bar.get_x() + bar.get_width()/2., height + (delta/30),
+                    '%.4g' % height, ha='center', va='bottom', fontsize=12, rotation=90)
+        plot.xticks(fontsize=12, rotation=45, ha='right')
+        plot.yticks(yTicks, fontsize=12)
+        fig.savefig(filePath, bbox_inches = 'tight', dpi=150)
+    finally:
+        # Release the figure, pyplot would otherwise keep it alive.
+        plot.close(fig)
--- a/guix.scm
+++ b/guix.scm
 (use-modules ((gnu packages python-science) #:select (python-pandas))
             ((gnu packages python-xyz) #:select (python-matplotlib
                                                  python-nltk
+                                                  python-numpy
                                                  python-seaborn))
             ((gnu packages graphviz) #:select (graphviz python-graphviz))
             (guix gexp)
@@ -24,6 +25,7 @@
            python-graphviz
            python-matplotlib
            python-nltk
+            python-numpy
            python-pandas
            python-seaborn
            ))

--- a/notebooks/Confusion_Matrices.ipynb
+++ b/notebooks/Confusion_Matrices.ipynb
@@ -5,7 +5,9 @@
   "id": "11511929",
   "metadata": {},
   "source": [
-    "# Confusion Matrices\n",
+    "# Lexical similarities\n",
+    "\n",
+    "We want to study the n-gram similarities between the domains: a visual way to achieve this is to represent these similarities as confusion matrices, the same format we used to visualize the errors of our models, which hence provides a basis for comparison.\n",
    "\n",
    "We start by including the EDdA modules from the [project's gitlab](https://gitlab.liris.cnrs.fr/geode/EDdA-Classification)."
   ]
@@ -19,7 +21,7 @@
   "source": [
    "from EDdA import data\n",
    "from EDdA.store import preparePath\n",
-    "from EDdA.classification import confusionMatrix, metrics, toPNG, topNGrams\n",
+    "from EDdA.classification import confusionMatrix, heatmap, metrics, topNGrams\n",
    "import os"
   ]
  },
@@ -46,7 +48,7 @@
   "id": "4079559f",
   "metadata": {},
   "source": [
-    "We loop on the n-gram size (`n`), the number of `ranks` to keep when computing the most frequent ones and the comparison method (the metrics' `name`)."
+    "We loop on the n-gram size (`n`), the number of `ranks` to keep when computing the most frequent ones and the comparison method (the metrics' `name`), generating a PNG confusion matrix for each combination."
   ]
  },
  {
@@ -61,14 +63,14 @@
    "        vectorizer = topNGrams(source, n, ranks)\n",
    "        for name in ['colinearity', 'keysIntersection']:\n",
    "            imagePath = preparePath(f\"confusionMatrix/{source.hash}/{n}grams_top{ranks}_{name}.png\")\n",
-    "            toPNG(confusionMatrix(vectorizer, metrics[name]), imagePath)"
+    "            heatmap(confusionMatrix(vectorizer, metrics[name]), imagePath)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
-   "language": "/gnu/store/2rpsj69fzmcnafz4rml0blrynfayxqzr-python-wrapper-3.9.9/bin/python",
+   "language": "/gnu/store/fby6l226w8kh2mwkzpjpajmgy0q1kxli-python-wrapper-3.9.9/bin/python",
   "name": "python3"
  },
  "language_info": {

 %% Cell type:markdown id:11511929 tags:
-# Confusion Matrices
+# Lexical similarities
+We want to study the n-gram similarities between the domains: a visual way to achieve this is to represent these similarities as confusion matrices, the same format we used to visualize the errors of our models, which hence provides a basis for comparison.
 We start by including the EDdA modules from the [project's gitlab](https://gitlab.liris.cnrs.fr/geode/EDdA-Classification).
 %% Cell type:code id:a5f3d434 tags:
-``` /gnu/store/2rpsj69fzmcnafz4rml0blrynfayxqzr-python-wrapper-3.9.9/bin/python
+``` /gnu/store/fby6l226w8kh2mwkzpjpajmgy0q1kxli-python-wrapper-3.9.9/bin/python
 from EDdA import data
 from EDdA.store import preparePath
-from EDdA.classification import confusionMatrix, metrics, toPNG, topNGrams
+from EDdA.classification import confusionMatrix, heatmap, metrics, topNGrams
 import os
 ```
 %% Cell type:markdown id:4c3064ea tags:
 Then we load the training set into a new data structure called a `Source`, which contains a `pandas` `Dataframe` and a hash computed from the list of exact articles "coordinates" (volume and article number, and their order matters) contained in the original tsv file.
 %% Cell type:code id:5ad65685 tags:
-``` /gnu/store/2rpsj69fzmcnafz4rml0blrynfayxqzr-python-wrapper-3.9.9/bin/python
+``` /gnu/store/fby6l226w8kh2mwkzpjpajmgy0q1kxli-python-wrapper-3.9.9/bin/python
 source = data.load('training_set')
 ```
 %% Cell type:markdown id:4079559f tags:
-We loop on the n-gram size (`n`), the number of `ranks` to keep when computing the most frequent ones and the comparison method (the metrics' `name`).
+We loop on the n-gram size (`n`), the number of `ranks` to keep when computing the most frequent ones and the comparison method (the metrics' `name`), generating a PNG confusion matrix for each combination.
 %% Cell type:code id:b39c5be0 tags:
-``` /gnu/store/2rpsj69fzmcnafz4rml0blrynfayxqzr-python-wrapper-3.9.9/bin/python
+``` /gnu/store/fby6l226w8kh2mwkzpjpajmgy0q1kxli-python-wrapper-3.9.9/bin/python
 for n in range(1,4):
    for ranks in [10, 50, 100]:
        vectorizer = topNGrams(source, n, ranks)
        for name in ['colinearity', 'keysIntersection']:
            imagePath = preparePath(f"confusionMatrix/{source.hash}/{n}grams_top{ranks}_{name}.png")
-            toPNG(confusionMatrix(vectorizer, metrics[name]), imagePath)
+            heatmap(confusionMatrix(vectorizer, metrics[name]), imagePath)
 ```

--- a/scripts/confusionMatrices.py
+++ b/scripts/confusionMatrices.py
@@ -2,8 +2,7 @@
 from EDdA import data
 from EDdA.store import preparePath
-from EDdA.classification import confusionMatrix, metrics, toPNG, topNGrams
+from EDdA.classification import confusionMatrix, heatmap, metrics, topNGrams
-import os
 import sys
 def __syntax(this):
@@ -25,7 +24,7 @@ def __compute(sourcePath, ns, ranksToTry, metricNames, root):
            vectorizer = topNGrams(source, n, ranks)
            for name in metricNames:
                imagePath = preparePath(f"{path}/{n}grams_top{ranks}_{name}.png")
-                toPNG(confusionMatrix(vectorizer, metrics[name]), imagePath)
+                heatmap(confusionMatrix(vectorizer, metrics[name]), imagePath)
 if __name__ == '__main__':
    argc = len(sys.argv)