Commit ec79f583 authored by Alice Brenon

Draft the basis of a lib to expose the fundamental concepts around the project

parent 2b2d9d72
# cache.py

import json
import os
import os.path
class Cache:
    # Two-level cache: entries live in RAM and are mirrored on disk under ROOT.
    ROOT = "cache"

    @staticmethod
    def filePath(symbolicPath):
        return os.path.join(Cache.ROOT, symbolicPath)
    def __init__(self, loader, pathPolicy=lambda *args: str(args),
                 serializer=None, unserializer=None):
        self.RAM = {}                     # in-memory store, keyed by symbolic path
        self.loader = loader              # computes a value from the call arguments
        self.pathPolicy = pathPolicy      # maps call arguments to a symbolic path
        self.serializer = serializer      # writes a value to an open file
        self.unserializer = unserializer  # reads a value back from an open file
    def __call__(self, *args):
        symbolicPath = self.pathPolicy(*args)
        self.heat(symbolicPath)  # try to restore the entry from disk first
        if symbolicPath not in self.RAM:
            self.RAM[symbolicPath] = self.loader(*args)
            self.save(symbolicPath)  # persist the freshly computed value
        return self.RAM[symbolicPath]
    def heat(self, symbolicPath):
        # Populate the RAM entry from its on-disk counterpart, if any.
        if self.unserializer and symbolicPath not in self.RAM:
            path = Cache.filePath(symbolicPath)
            if os.path.isfile(path):
                with open(path, 'r') as f:
                    self.RAM[symbolicPath] = self.unserializer(f)

    def save(self, symbolicPath):
        # Mirror the RAM entry to disk, creating directories as needed.
        if self.serializer and symbolicPath in self.RAM:
            path = Cache.filePath(symbolicPath)
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, 'w') as f:
                self.serializer(self.RAM[symbolicPath], f)
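
# A minimal usage sketch, not part of the module: memoising squares as JSON
# files under cache/squares/. The loader computes the value, pathPolicy names
# it, and json handles (de)serialisation. All names below are illustrative.
if __name__ == '__main__':
    squares = Cache(
        lambda x: x * x,
        pathPolicy=lambda x: "squares/{x}.json".format(x=x),
        serializer=json.dump,
        unserializer=json.load
    )
    print(squares(4))  # computed, then stored in RAM and cache/squares/4.json
    print(squares(4))  # second call served straight from RAM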

# data.py

from hashlib import md5
import pandas
from os.path import isfile

class Source:
    # Wraps the articles DataFrame and fingerprints its content, so that
    # derived, cached files can be tied to the exact input they came from.
    def __init__(self, articles):
        self.articles = articles
        articleIDs = articles.volume.map(str) + '-' + articles.numero.map(str)
        self.hash = md5(':'.join(articleIDs).encode('ascii')).hexdigest()

def load(name, textColumn="contentWithoutClass",
         classColumn="ensemble_domaine_enccre"):
    # Resolve either a direct file path or a dataset name under datasets/,
    # and drop the articles that carry no domain label.
    fileName = name if isfile(name) else "datasets/{name}.tsv".format(name=name)
    return Source(
        pandas.read_csv(fileName, sep='\t')
            .dropna(subset=[classColumn])
            .reset_index(drop=True)
    )

# The ENCCRE knowledge domains used as classification labels.
domains = [
'Agriculture - Economie rustique', 'Anatomie', 'Antiquité',
'Architecture', 'Arts et métiers', 'Beaux-arts',
'Belles-lettres - Poésie', 'Blason', 'Caractères', 'Chasse',
'Chimie', 'Commerce', 'Droit - Jurisprudence',
'Economie domestique', 'Grammaire', 'Géographie', 'Histoire',
'Histoire naturelle', 'Jeu', 'Marine', 'Maréchage - Manège',
'Mathématiques', 'Mesure', 'Militaire (Art) - Guerre - Arme',
'Minéralogie', 'Monnaie', 'Musique', 'Médailles',
'Médecine - Chirurgie', 'Métiers', 'Pharmacie', 'Philosophie',
'Physique - [Sciences physico-mathématiques]', 'Politique',
'Pêche', 'Religion', 'Spectacle', 'Superstition'
]

def domain(articles, name):
    # Restrict the articles DataFrame to a single ENCCRE domain.
    return articles[articles.ensemble_domaine_enccre == name]
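
# Usage sketch; the dataset name is hypothetical and must resolve to an
# existing file or to datasets/<name>.tsv with the ENCCRE columns used above.
if __name__ == '__main__':
    source = load("articles")
    geographie = domain(source.articles, 'Géographie')
    print(source.hash, len(geographie), "articles in 'Géographie'")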

# Presumably results.py (the script below does "import results"): helpers to
# load pickled vectorizers and classifiers from models/.

import pickle
import sklearn

def vectorizerFileName(name, samplingSize):
    # Model files are named after the vectorizer and its training sample size.
    return "{name}_s{samplingSize}".format(name=name, samplingSize=samplingSize)

def vectorizer(name, samplingSize=10000):
    # Load a pickled vectorizer from models/<name>_s<samplingSize>.pkl.
    filePath = "models/{fileName}.pkl".format(
        fileName=vectorizerFileName(name, samplingSize)
    )
    with open(filePath, 'rb') as file:
        return pickle.load(file)

def classifier(name, vectorizerName, samplingSize=10000):
    # Load a pickled classifier trained against the given vectorizer.
    filePath = "models/{name}_{vectorizer}.pkl".format(
        name=name,
        vectorizer=vectorizerFileName(vectorizerName, samplingSize)
    )
    with open(filePath, 'rb') as file:
        return pickle.load(file)
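
# Usage sketch; the model names are hypothetical and must match pickle files
# that a training step previously wrote under models/.
if __name__ == '__main__':
    v = vectorizer("tfidf")           # loads models/tfidf_s10000.pkl
    clf = classifier("sgd", "tfidf")  # loads models/sgd_tfidf_s10000.pkl
    print(type(v).__name__, type(clf).__name__)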

# Main script: compute and cache per-domain n-gram frequencies and the
# top-ranked n-grams derived from them.

from cache import Cache
import data
import nltk
import pandas
import results
import sys

def frequenciesLoader(articles, n, domain):
    # Count the occurrences of every n-gram in the domain's articles.
    texts = data.domain(articles, domain).contentWithoutClass
    state = {}
    for text in texts:
        for k in nltk.ngrams(text.split(), n):
            state[k] = state.get(k, 0) + 1
    return state

def frequenciesPath(inputHash, n):
    # Symbolic cache path for the per-domain frequency files.
    return lambda domain: "frequencies/{inputHash}/{n}grams/{domain}.csv".format(
        inputHash=inputHash, n=n, domain=domain
    )

def loadFrequencies(f):
    # Read a {ngram tuple: count} dict back from a tab-separated file
    # (tab-separated despite the .csv extension used by the path policies).
    csv = pandas.read_csv(f, sep='\t')
    return dict(zip(
        csv.ngram.map(lambda s: tuple(s.split(','))),
        csv.frequency
    ))

def saveFrequencies(freqs, f):
    # Store a {ngram tuple: count} dict as a tab-separated file; tuples are
    # flattened into comma-joined strings. The parameter is named freqs to
    # avoid shadowing the imported data module.
    pandas.DataFrame(data={
        'ngram': [','.join(t) for t in freqs.keys()],
        'frequency': list(freqs.values())
    }).to_csv(f, sep='\t', index=False)

def frequencies(source, n):
    # Cache of per-domain n-gram frequencies, keyed on the source fingerprint.
    return Cache(
        lambda domain: frequenciesLoader(source.articles, n, domain),
        pathPolicy=frequenciesPath(source.hash, n),
        serializer=saveFrequencies,
        unserializer=loadFrequencies
    )

def topLoader(frequencyEvaluator, ranks):
    # Keep only the `ranks` most frequent n-grams of a domain. The evaluator
    # maps a domain name to a {ngram: count} dict, so it takes a single
    # argument (the original two-argument call would not have matched the
    # Cache built by frequencies above).
    return lambda domain: dict(
        nltk.FreqDist(frequencyEvaluator(domain)).most_common(ranks)
    )

def topPath(inputHash, n, ranks):
    # Symbolic cache path for the per-domain top n-gram files.
    return lambda domain: "topNGrams/{inputHash}/{n}grams/top{ranks}/{domain}.csv".format(
        inputHash=inputHash,
        n=n,
        ranks=ranks,
        domain=domain
    )

def topNGrams(source, n, ranks):
    # Cache of the `ranks` most frequent n-grams per domain, built on top of
    # the (itself cached) full frequency counts.
    freq = frequencies(source, n)
    return Cache(
        topLoader(freq, ranks),
        pathPolicy=topPath(source.hash, n, ranks),
        serializer=saveFrequencies,
        unserializer=loadFrequencies
    )
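
# Usage sketch (the dataset name is hypothetical): the first call computes
# bigram counts for 'Géographie' and caches them, then derives the top 100:
#
#   source = data.load("articles")
#   top100 = topNGrams(source, 2, 100)
#   print(top100('Géographie'))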

def __syntax(this):
    # Print usage on stderr and exit with an error status. The input is read
    # as a tab-separated file, hence the .tsv hint.
    print(
        "Syntax: {this} {required} {optional}".format(
            this=this,
            required="ARTICLES_DATA(.tsv)",
            optional="[NGRAM_SIZE] [TOP_RANKS_SIZE] [DOMAIN]"
        ),
        file=sys.stderr
    )
    sys.exit(1)

def __compute(articlesSource, ns, ranksToTry, domains):
    # Warm the caches for every requested (n, ranks, domain) combination;
    # the source is loaded once rather than on every iteration.
    source = data.load(articlesSource)
    for n in ns:
        for ranks in ranksToTry:
            cached = topNGrams(source, n, ranks)
            for domain in domains:
                cached(domain)

if __name__ == '__main__':
    argc = len(sys.argv)
    if argc < 2:
        __syntax(sys.argv[0])
    else:
        articlesSource = sys.argv[1]
        # Defaults: sweep 1- to 3-grams, several rank cut-offs, all domains.
        ns = [int(sys.argv[2])] if argc > 2 else range(1, 4)
        ranksToTry = [int(sys.argv[3])] if argc > 3 else [10, 100, 50]
        domains = [sys.argv[4]] if argc > 4 else data.domains
        __compute(articlesSource, ns, ranksToTry, domains)
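
# Example invocation (the script and data file names are hypothetical):
#
#   python topNGrams.py datasets/articles.tsv 2 100 'Géographie'
#
# Run with only the data file, it sweeps n in 1..3, ranks in {10, 100, 50}
# and every domain listed in data.domains.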