# --- lib/cache.py ------------------------------------------------------------
import os
import os.path


class Cache:
    """Memoizing wrapper around a loader function, with an optional disk layer.

    Values are kept in ``self.RAM`` keyed by a symbolic path produced by
    ``pathPolicy``; when a serializer/unserializer pair is supplied, entries
    are also persisted under ``Cache.ROOT`` so they survive across runs.
    """

    ROOT = "cache"  # base directory of the on-disk layer

    @staticmethod  # fix: was a plain def — calling it on an instance would pass self
    def filePath(symbolicPath):
        """Map a symbolic cache key to its on-disk path under ``Cache.ROOT``."""
        return "{root}/{path}".format(root=Cache.ROOT, path=symbolicPath)

    def __init__(self, loader, pathPolicy=lambda *args: str(args),
                 serializer=None, unserializer=None):
        """
        loader       -- computes the value for the call arguments on a miss
        pathPolicy   -- maps call arguments to a symbolic cache key
        serializer   -- serializer(value, fileObject); enables disk writes
        unserializer -- unserializer(fileObject) -> value; enables disk reads
        """
        self.RAM = {}
        self.loader = loader
        self.pathPolicy = pathPolicy
        self.serializer = serializer
        self.unserializer = unserializer

    def __call__(self, *args):
        """Return the cached value for *args*, computing and saving on miss."""
        symbolicPath = self.pathPolicy(*args)
        self.heat(symbolicPath)
        if symbolicPath not in self.RAM:
            self.RAM[symbolicPath] = self.loader(*args)
            self.save(symbolicPath)
        return self.RAM[symbolicPath]

    def heat(self, symbolicPath):
        """Warm the RAM cache from disk if the entry exists there."""
        if self.unserializer and symbolicPath not in self.RAM:
            path = Cache.filePath(symbolicPath)
            if os.path.isfile(path):
                with open(path, 'r') as f:
                    self.RAM[symbolicPath] = self.unserializer(f)

    def save(self, symbolicPath):
        """Persist a RAM entry to disk, creating parent directories as needed."""
        if self.serializer and symbolicPath in self.RAM:
            path = Cache.filePath(symbolicPath)
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, 'w') as f:
                self.serializer(self.RAM[symbolicPath], f)


# --- lib/data.py -------------------------------------------------------------
from hashlib import md5
from os.path import isfile

import pandas


class Source:
    """A corpus of articles plus a stable hash identifying its exact contents."""

    def __init__(self, articles):
        self.articles = articles
        # volume-numero pairs identify articles; hashing them lets caches of
        # derived data be invalidated whenever the input corpus changes
        articleIDs = articles.volume.map(str) + '-' + articles.numero.map(str)
        self.hash = md5(':'.join(articleIDs).encode('ascii')).hexdigest()


def load(name, textColumn="contentWithoutClass",
         classColumn="ensemble_domaine_enccre"):
    """Load a TSV dataset (direct path, or name under datasets/) as a Source.

    Rows with no class label are dropped and the index is rebuilt.
    NOTE(review): textColumn is accepted but never used here — presumably a
    hook for callers; confirm before removing.
    """
    fileName = name if isfile(name) else "datasets/{name}.tsv".format(name=name)
    return Source(pandas.read_csv(fileName, sep='\t')
                  .dropna(subset=[classColumn])
                  .reset_index(drop=True))
# --- lib/data.py (continued) --------------------------------------------------

# Closed list of the 38 domain labels appearing in ensemble_domaine_enccre.
domains = [
    'Agriculture - Economie rustique', 'Anatomie', 'Antiquité',
    'Architecture', 'Arts et métiers', 'Beaux-arts',
    'Belles-lettres - Poésie', 'Blason', 'Caractères', 'Chasse',
    'Chimie', 'Commerce', 'Droit - Jurisprudence',
    'Economie domestique', 'Grammaire', 'Géographie', 'Histoire',
    'Histoire naturelle', 'Jeu', 'Marine', 'Maréchage - Manège',
    'Mathématiques', 'Mesure', 'Militaire (Art) - Guerre - Arme',
    'Minéralogie', 'Monnaie', 'Musique', 'Médailles',
    'Médecine - Chirurgie', 'Métiers', 'Pharmacie', 'Philosophie',
    'Physique - [Sciences physico-mathématiques]', 'Politique',
    'Pêche', 'Religion', 'Spectacle', 'Superstition'
    ]


def domain(articles, name):
    """Return the subset of articles labeled with the given domain."""
    return articles[articles.ensemble_domaine_enccre == name]


# --- lib/model.py -------------------------------------------------------------
import pickle


def vectorizerFileName(name, samplingSize):
    """Base file name of a pickled vectorizer trained on samplingSize articles."""
    return "{name}_s{samplingSize}".format(name=name, samplingSize=samplingSize)


def vectorizer(name, samplingSize=10000):
    """Unpickle a vectorizer from models/.

    NOTE(review): pickle.load must only be used on trusted model files —
    unpickling attacker-controlled data executes arbitrary code.
    """
    filePath = "models/{fileName}.pkl".format(
        fileName=vectorizerFileName(name, samplingSize)
    )
    with open(filePath, 'rb') as file:
        return pickle.load(file)


def classifier(name, vectorizerName, samplingSize=10000):
    """Unpickle a classifier trained against the given vectorizer (same caveat
    as vectorizer(): trusted files only)."""
    filePath = "models/{name}_{vectorizer}.pkl".format(
        name=name,
        vectorizer=vectorizerFileName(vectorizerName, samplingSize)
    )
    with open(filePath, 'rb') as file:
        return pickle.load(file)


# --- lib/topNGrams.py: imports ------------------------------------------------
# (unused `import sklearn` and `import results` removed; both modules are
# never referenced in the visible code)
from cache import Cache
import data
import nltk
import pandas
import sys
def frequenciesLoader(articles, n, domain):
    """Count n-gram occurrences over every text of one domain.

    Returns a dict mapping each n-gram (tuple of whitespace tokens) to its
    total frequency across the domain's contentWithoutClass texts.
    """
    texts = data.domain(articles, domain).contentWithoutClass
    state = {}
    for text in texts:
        # no need to materialize the ngrams generator into a list first
        for gram in nltk.ngrams(text.split(), n):
            state[gram] = state.get(gram, 0) + 1
    return state


def frequenciesPath(inputHash, n):
    """Return a path policy mapping a domain to its cached frequency file."""
    return lambda domain: \
        "frequencies/{inputHash}/{n}grams/{domain}.csv".format(
            inputHash=inputHash, n=n, domain=domain
        )


def loadFrequencies(f):
    """Read back a frequency table written by saveFrequencies."""
    csv = pandas.read_csv(f, sep='\t')
    return dict(zip(
        csv.ngram.map(lambda s: tuple(s.split(','))),
        csv.frequency
    ))


def saveFrequencies(data, f):
    """Write a {ngram-tuple: frequency} dict as a two-column TSV.

    NOTE(review): tuples are joined with ',' — tokens containing a comma
    would not round-trip; assumes whitespace-split tokens are comma-free.
    """
    pandas.DataFrame(data={
        'ngram': [','.join(t) for t in data.keys()],
        'frequency': list(data.values())
    }).to_csv(f, sep='\t', index=False)


def frequencies(source, n):
    """Disk-backed cache of per-domain n-gram frequency tables for a corpus."""
    return Cache(
        lambda domain: frequenciesLoader(source.articles, n, domain),
        pathPolicy=frequenciesPath(source.hash, n),
        serializer=saveFrequencies,
        unserializer=loadFrequencies
    )


def topLoader(frequencyEvaluator, n, ranks):
    """Build a loader keeping only the `ranks` most frequent n-grams.

    `n` is kept for signature compatibility; the evaluator produced by
    frequencies() is already specialized on n and takes the domain alone.
    """
    # BUG FIX: was frequencyEvaluator(n, domain) — the Cache returned by
    # frequencies() has pathPolicy/loader lambdas of one argument, so the
    # extra n raised TypeError on every top-cache miss.
    return lambda domain: dict(
        nltk.FreqDist(frequencyEvaluator(domain)).most_common(ranks)
    )


def topPath(inputHash, n, ranks):
    """Return a path policy for cached top-`ranks` n-gram files."""
    return lambda domain: \
        "topNGrams/{inputHash}/{n}grams/top{ranks}/{domain}.csv".format(
            inputHash=inputHash, n=n, ranks=ranks, domain=domain
        )


def topNGrams(source, n, ranks):
    """Cache of the `ranks` most frequent n-grams per domain, layered on top
    of the (also cached) full frequency tables."""
    freq = frequencies(source, n)
    return Cache(
        topLoader(freq, n, ranks),
        pathPolicy=topPath(source.hash, n, ranks),
        serializer=saveFrequencies,
        unserializer=loadFrequencies
    )


def __syntax(this):
    """Print usage to stderr and exit with status 1."""
    print(
        "Syntax: {this} {required} {optional}".format(
            this=this,
            # datasets are tab-separated (see data.load) — message said .csv
            required="ARTICLES_DATA(.tsv)",
            optional="[NGRAM SIZE] [TOP_RANKS_SIZE] [DOMAIN]"
        ),
        file=sys.stderr
    )
    sys.exit(1)


def __compute(articlesSource, ns, ranksToTry, domains):
    """Fill the topNGrams caches for every (n, ranks, domain) combination."""
    # hoisted out of the loops: the corpus was re-read for each (n, ranks)
    source = data.load(articlesSource)
    for n in ns:
        for ranks in ranksToTry:
            cached = topNGrams(source, n, ranks)
            for domain in domains:
                cached(domain)


if __name__ == '__main__':
    argc = len(sys.argv)
    if argc < 2:
        __syntax(sys.argv[0])
    else:
        articlesSource = sys.argv[1]
        ns = [int(sys.argv[2])] if argc > 2 else range(1, 4)
        ranksToTry = [int(sys.argv[3])] if argc > 3 else [10, 100, 50]
        domains = [sys.argv[4]] if argc > 4 else data.domains
        __compute(articlesSource, ns, ranksToTry, domains)