Commit ec79f583 authored by Alice Brenon

Draft the basis of a lib to expose the fundamental concepts around the project

parent 2b2d9d72
# cache.py

import json
import os
import os.path
class Cache:
    # Two-level cache: entries live in RAM and are mirrored on disk under ROOT.
    ROOT = "cache"

    @staticmethod
    def filePath(symbolicPath):
        return os.path.join(Cache.ROOT, symbolicPath)
    def __init__(self, loader, pathPolicy=lambda *args: str(args),
                 serializer=None, unserializer=None):
        self.RAM = {}                     # in-memory store, keyed by symbolic path
        self.loader = loader              # computes a value from the call arguments
        self.pathPolicy = pathPolicy      # maps call arguments to a symbolic path
        self.serializer = serializer      # writes a value to an open file
        self.unserializer = unserializer  # reads a value back from an open file
    def __call__(self, *args):
        symbolicPath = self.pathPolicy(*args)
        self.heat(symbolicPath)  # try to restore the entry from disk first
        if symbolicPath not in self.RAM:
            self.RAM[symbolicPath] = self.loader(*args)
            self.save(symbolicPath)  # persist the freshly computed value
        return self.RAM[symbolicPath]
    def heat(self, symbolicPath):
        # Populate the RAM entry from its on-disk counterpart, if any.
        if self.unserializer and symbolicPath not in self.RAM:
            path = Cache.filePath(symbolicPath)
            if os.path.isfile(path):
                with open(path, 'r') as f:
                    self.RAM[symbolicPath] = self.unserializer(f)

    def save(self, symbolicPath):
        # Mirror the RAM entry to disk, creating directories as needed.
        if self.serializer and symbolicPath in self.RAM:
            path = Cache.filePath(symbolicPath)
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, 'w') as f:
                self.serializer(self.RAM[symbolicPath], f)
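
# A minimal usage sketch, not part of the module: memoising squares as JSON
# files under cache/squares/. The loader computes the value, pathPolicy names
# it, and json handles (de)serialisation. All names below are illustrative.
if __name__ == '__main__':
    squares = Cache(
        lambda x: x * x,
        pathPolicy=lambda x: "squares/{x}.json".format(x=x),
        serializer=json.dump,
        unserializer=json.load
    )
    print(squares(4))  # computed, then stored in RAM and cache/squares/4.json
    print(squares(4))  # second call served straight from RAM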

# data.py

from hashlib import md5
import pandas
from os.path import isfile

class Source:
    # Wraps the articles DataFrame and fingerprints its content, so that
    # derived, cached files can be tied to the exact input they came from.
    def __init__(self, articles):
        self.articles = articles
        articleIDs = articles.volume.map(str) + '-' + articles.numero.map(str)
        self.hash = md5(':'.join(articleIDs).encode('ascii')).hexdigest()

def load(name, textColumn="contentWithoutClass",
         classColumn="ensemble_domaine_enccre"):
    # Resolve either a direct file path or a dataset name under datasets/,
    # and drop the articles that carry no domain label.
    fileName = name if isfile(name) else "datasets/{name}.tsv".format(name=name)
    return Source(
        pandas.read_csv(fileName, sep='\t')
            .dropna(subset=[classColumn])
            .reset_index(drop=True)
    )

# The ENCCRE knowledge domains used as classification labels.
domains = [
'Agriculture - Economie rustique', 'Anatomie', 'Antiquité',
'Architecture', 'Arts et métiers', 'Beaux-arts',
'Belles-lettres - Poésie', 'Blason', 'Caractères', 'Chasse',
'Chimie', 'Commerce', 'Droit - Jurisprudence',
'Economie domestique', 'Grammaire', 'Géographie', 'Histoire',
'Histoire naturelle', 'Jeu', 'Marine', 'Maréchage - Manège',
'Mathématiques', 'Mesure', 'Militaire (Art) - Guerre - Arme',
'Minéralogie', 'Monnaie', 'Musique', 'Médailles',
'Médecine - Chirurgie', 'Métiers', 'Pharmacie', 'Philosophie',
'Physique - [Sciences physico-mathématiques]', 'Politique',
'Pêche', 'Religion', 'Spectacle', 'Superstition'
]

def domain(articles, name):
    # Restrict the articles DataFrame to a single ENCCRE domain.
    return articles[articles.ensemble_domaine_enccre == name]
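
# Usage sketch; the dataset name is hypothetical and must resolve to an
# existing file or to datasets/<name>.tsv with the ENCCRE columns used above.
if __name__ == '__main__':
    source = load("articles")
    geographie = domain(source.articles, 'Géographie')
    print(source.hash, len(geographie), "articles in 'Géographie'")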

# Presumably results.py (the script below does "import results"): helpers to
# load pickled vectorizers and classifiers from models/.

import pickle
import sklearn

def vectorizerFileName(name, samplingSize):
    # Model files are named after the vectorizer and its training sample size.
    return "{name}_s{samplingSize}".format(name=name, samplingSize=samplingSize)

def vectorizer(name, samplingSize=10000):
    # Load a pickled vectorizer from models/<name>_s<samplingSize>.pkl.
    filePath = "models/{fileName}.pkl".format(
        fileName=vectorizerFileName(name, samplingSize)
    )
    with open(filePath, 'rb') as file:
        return pickle.load(file)

def classifier(name, vectorizerName, samplingSize=10000):
    # Load a pickled classifier trained against the given vectorizer.
    filePath = "models/{name}_{vectorizer}.pkl".format(
        name=name,
        vectorizer=vectorizerFileName(vectorizerName, samplingSize)
    )
    with open(filePath, 'rb') as file:
        return pickle.load(file)
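
# Usage sketch; the model names are hypothetical and must match pickle files
# that a training step previously wrote under models/.
if __name__ == '__main__':
    v = vectorizer("tfidf")           # loads models/tfidf_s10000.pkl
    clf = classifier("sgd", "tfidf")  # loads models/sgd_tfidf_s10000.pkl
    print(type(v).__name__, type(clf).__name__)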

# Main script: compute and cache per-domain n-gram frequencies and the
# top-ranked n-grams derived from them.

from cache import Cache
import data
import nltk
import pandas
import results
import sys

def frequenciesLoader(articles, n, domain):
    # Count the occurrences of every n-gram in the domain's articles.
    texts = data.domain(articles, domain).contentWithoutClass
    state = {}
    for text in texts:
        for k in nltk.ngrams(text.split(), n):
            state[k] = state.get(k, 0) + 1
    return state

def frequenciesPath(inputHash, n):
    # Symbolic cache path for the per-domain frequency files.
    return lambda domain: "frequencies/{inputHash}/{n}grams/{domain}.csv".format(
        inputHash=inputHash, n=n, domain=domain
    )

def loadFrequencies(f):
    # Read a {ngram tuple: count} dict back from a tab-separated file
    # (tab-separated despite the .csv extension used by the path policies).
    csv = pandas.read_csv(f, sep='\t')
    return dict(zip(
        csv.ngram.map(lambda s: tuple(s.split(','))),
        csv.frequency
    ))

def saveFrequencies(freqs, f):
    # Store a {ngram tuple: count} dict as a tab-separated file; tuples are
    # flattened into comma-joined strings. The parameter is named freqs to
    # avoid shadowing the imported data module.
    pandas.DataFrame(data={
        'ngram': [','.join(t) for t in freqs.keys()],
        'frequency': list(freqs.values())
    }).to_csv(f, sep='\t', index=False)

def frequencies(source, n):
    # Cache of per-domain n-gram frequencies, keyed on the source fingerprint.
    return Cache(
        lambda domain: frequenciesLoader(source.articles, n, domain),
        pathPolicy=frequenciesPath(source.hash, n),
        serializer=saveFrequencies,
        unserializer=loadFrequencies
    )

def topLoader(frequencyEvaluator, ranks):
    # Keep only the `ranks` most frequent n-grams of a domain. The evaluator
    # maps a domain name to a {ngram: count} dict, so it takes a single
    # argument (the original two-argument call would not have matched the
    # Cache built by frequencies above).
    return lambda domain: dict(
        nltk.FreqDist(frequencyEvaluator(domain)).most_common(ranks)
    )

def topPath(inputHash, n, ranks):
    # Symbolic cache path for the per-domain top n-gram files.
    return lambda domain: "topNGrams/{inputHash}/{n}grams/top{ranks}/{domain}.csv".format(
        inputHash=inputHash,
        n=n,
        ranks=ranks,
        domain=domain
    )

def topNGrams(source, n, ranks):
    # Cache of the `ranks` most frequent n-grams per domain, built on top of
    # the (itself cached) full frequency counts.
    freq = frequencies(source, n)
    return Cache(
        topLoader(freq, ranks),
        pathPolicy=topPath(source.hash, n, ranks),
        serializer=saveFrequencies,
        unserializer=loadFrequencies
    )
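
# Usage sketch (the dataset name is hypothetical): the first call computes
# bigram counts for 'Géographie' and caches them, then derives the top 100:
#
#   source = data.load("articles")
#   top100 = topNGrams(source, 2, 100)
#   print(top100('Géographie'))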

def __syntax(this):
    # Print usage on stderr and exit with an error status. The input is read
    # as a tab-separated file, hence the .tsv hint.
    print(
        "Syntax: {this} {required} {optional}".format(
            this=this,
            required="ARTICLES_DATA(.tsv)",
            optional="[NGRAM_SIZE] [TOP_RANKS_SIZE] [DOMAIN]"
        ),
        file=sys.stderr
    )
    sys.exit(1)

def __compute(articlesSource, ns, ranksToTry, domains):
    # Warm the caches for every requested (n, ranks, domain) combination;
    # the source is loaded once rather than on every iteration.
    source = data.load(articlesSource)
    for n in ns:
        for ranks in ranksToTry:
            cached = topNGrams(source, n, ranks)
            for domain in domains:
                cached(domain)

if __name__ == '__main__':
    argc = len(sys.argv)
    if argc < 2:
        __syntax(sys.argv[0])
    else:
        articlesSource = sys.argv[1]
        # Defaults: sweep 1- to 3-grams, several rank cut-offs, all domains.
        ns = [int(sys.argv[2])] if argc > 2 else range(1, 4)
        ranksToTry = [int(sys.argv[3])] if argc > 3 else [10, 100, 50]
        domains = [sys.argv[4]] if argc > 4 else data.domains
        __compute(articlesSource, ns, ranksToTry, domains)
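
# Example invocation (the script and data file names are hypothetical):
#
#   python topNGrams.py datasets/articles.tsv 2 100 'Géographie'
#
# Run with only the data file, it sweeps n in 1..3, ranks in {10, 100, 50}
# and every domain listed in data.domains.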