"""N-gram frequency helpers and French lemmatisation (GEODE/Classification/NGrams.py)."""
from string import punctuation

import nltk
import pandas  # required by saveFrequencies; was missing from the original

from GEODE.Metadata import articleKey, toKey
from GEODE.Classification.Stopwords import isStopWord
from GEODE.Store import tabular

# NLP resources shared by every call to fullLemmas. They start as None and
# are filled in on first use, so importing this module does not import spaCy
# or the Lefff lemmatizer.
tools = {'lemmatize': None, 'spacyFr': None}


def fullLemmas(text):
    """Return the lemmas of the meaningful tokens found in *text*.

    The text is tokenised with the spaCy pipeline ``fr_core_news_sm`` and
    each token text is lower-cased. Tokens for which ``isStopWord`` is true
    are dropped, as are tokens that occur as a substring of
    ``string.punctuation``. The remaining tokens are passed one by one to
    ``FrenchLefffLemmatizer().lemmatize``.

    :param text: the string to analyse
    :return: a list holding the lemmatizer's result for each kept token,
        in document order
    """
    if tools['lemmatize'] is None:
        from french_lefff_lemmatizer.french_lefff_lemmatizer import (
            FrenchLefffLemmatizer,
        )
        tools['lemmatize'] = FrenchLefffLemmatizer().lemmatize
    if tools['spacyFr'] is None:
        import spacy
        tools['spacyFr'] = spacy.load("fr_core_news_sm")
    tokens = [token.text.lower() for token in tools['spacyFr'](text)]
    # `punctuation` is a str, so `in` is a substring test: single punctuation
    # marks are rejected, and so is any run that appears verbatim inside
    # string.punctuation.
    return [tools['lemmatize'](token) for token in tokens
            if not isStopWord(token) and token not in punctuation]


def frequencies(corpus, domains, n):
    """Count the n-grams of every text of *corpus*, grouped by domain.

    :param corpus: object whose ``get_all()`` yields texts; each text must
        be accepted by ``toKey`` and expose its body under ``'content'``
    :param domains: table with a ``'domain'`` column that can be indexed by
        ``articleKey`` (used through ``set_index``, ``unique`` and ``loc``,
        presumably a pandas DataFrame — confirm against callers)
    :param n: size of the n-grams
    :return: a dict mapping each distinct domain (in sorted order) to an
        ``nltk.FreqDist`` of the n-gram tuples built from the
        whitespace-split content of the texts of that domain
    """
    metadata = domains.set_index(articleKey)
    byDomain = {d: nltk.FreqDist()
                for d in sorted(domains['domain'].unique())}
    for text in corpus.get_all():
        key = toKey(text)
        if key not in metadata.index:
            # No metadata row for this text: it belongs to no known domain.
            continue
        domain = metadata.loc[key]['domain']
        byDomain[domain].update(nltk.ngrams(text['content'].split(), n))
    return byDomain


def saveFrequencies(freqDist, f, top=100):
    """Write the *top* most common entries of *freqDist* to *f*.

    The output is tab-separated, without an index column, with the header
    ``ngram`` / ``frequency``.

    :param freqDist: object providing ``most_common(top)`` (for instance
        one of the values returned by ``frequencies``)
    :param f: path or file object handed to ``DataFrame.to_csv``
    :param top: number of entries to keep (default 100)
    """
    data = pandas.DataFrame(freqDist.most_common(top),
                            columns=['ngram', 'frequency'])
    data.to_csv(f, sep='\t', index=False)


def loadFrequencies(f, top=None):
    """Load an ``ngram`` → ``frequency`` mapping from *f*.

    :param f: source handed to ``GEODE.Store.tabular`` (presumably a file
        such as the ones written by ``saveFrequencies`` — confirm against
        ``tabular``)
    :param top: when not ``None``, only the first *top* rows are kept
    :return: a dict built from the ``ngram`` and ``frequency`` columns
    """
    data = tabular(f)
    if top is not None:
        data = data.head(top)
    return dict(zip(data.ngram, data.frequency))
"""French stop-word lists (GEODE/Classification/Stopwords.py)."""
import nltk

# Stop words shipped with the NLTK 'french' stopwords corpus.
nltk_stopwords = set(nltk.corpus.stopwords.words('french'))

# Additional stop words maintained by the project.
geode_stopwords = {
    'plus', 'dun', 'deux', 'autre', 'cette', 'quelque', 'étoit', 'avoit',
    'si', 'dont', 'quon', 'voyez', 'lautre', 'comme', 'fait', 'aussi',
    'leurs', 'tous', 'toute', 'autres', 'dit', 'selon', 'tout', 'étoient',
    'faire', 'lon', 'celle', 'ainsi', 'quelle', 'être', 'faut', 'peut',
    'entre', 'elles', 'ceux', 'donc', 'celui', 'nest', 'dautre', 'doit',
    'cet', 'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit',
    'neuf', 'dix', 'très', 'plus', 'ni', 'fit', 'parce', 'dire', 'douze',
    'toutes', 'après',
    "l'", "qu'", "s'", "c'", "d'", "n'", "j'", "m'", "t'",
    "jusqu'", "lorsqu'", "puisqu'", "quoiqu'",
}


def isStopWord(word, fromNLTK=True, fromGEODE=True):
    """Tell whether *word* belongs to one of the enabled stop-word lists.

    :param word: the token to look up (compared as-is, no normalisation)
    :param fromNLTK: consult ``nltk_stopwords`` when truthy
    :param fromGEODE: consult ``geode_stopwords`` when truthy
    :return: a truthy value if *word* is in an enabled list, a falsy one
        otherwise
    """
    foundInNLTK = fromNLTK and word in nltk_stopwords
    if foundInNLTK:
        return foundInNLTK
    # The project list is only consulted when the NLTK lookup did not match.
    return fromGEODE and word in geode_stopwords
""" from argparse import ArgumentParser -from GEODE.Classification import superdomains as domains +from GEODE.Classification import fullLemmas, isStopWord, superdomains as domains from GEODE.ENE import eneLabels from GEODE.Metadata import article, articleKey, paragraph, paragraphKey, \ fromKey, relativePath, toKey, uid diff --git a/guix.scm b/guix.scm index ec463d136fc27e2b01877ccda10af0d351a87109..8488c739744c29ac6b3d9df0402e8f37c812ed3c 100644 --- a/guix.scm +++ b/guix.scm @@ -3,6 +3,9 @@ ((gnu packages python-xyz) #:select (python-matplotlib python-nltk python-seaborn)) + ((geode packages models) #:select (nltk-data-corpora-stopwords + python-frenchleffflemmatizer + spacy-fr-core-news-sm)) (guix gexp) (guix git-download) ((guix licenses) #:select (lgpl3+)) @@ -20,11 +23,15 @@ #:select? (git-predicate %source-dir))) (build-system pyproject-build-system) (propagated-inputs - (list python-matplotlib + (list nltk-data-corpora-stopwords + python-frenchleffflemmatizer + python-matplotlib + python-nltk python-pandas python-scikit-learn python-seaborn - python-spacy)) + python-spacy + spacy-fr-core-news-sm)) (arguments (list #:tests? #f)) (home-page "https://gitlab.liris.cnrs.fr/geode/geopyck") diff --git a/requirements.txt b/requirements.txt index b8187358a1c9b3056e922b282bdf496801090773..cbfc58ab6df8694fc298a5a3ad4e0f9c7686ce71 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ matplotlib +nltk pandas +frenchleffflemmatizer scikit-learn seaborn spacy