Add NGrams and Stopwords functions to the GEODE.Classification module

6ae31c0f · Alice Brenon · 25558a3e · 6ae31c0f · 6ae31c0f · 6ae31c0f
Commit 6ae31c0f authored 1 year ago by Alice Brenon
--- a/GEODE/Classification/NGrams.py
+++ b/GEODE/Classification/NGrams.py
+from GEODE.Metadata import articleKey, toKey
+from GEODE.Classification.Stopwords import isStopWord
+from GEODE.Store import tabular
+import nltk
+from string import punctuation
+
+# NLP resources shared by every call to fullLemmas(); each entry stays None
+# until the first call, which loads it and stores it here for reuse.
+tools = {'lemmatize': None, 'spacyFr': None}
+
+def fullLemmas(text):
+    """Return the lemmas of the meaningful tokens of `text`.
+
+    The text is tokenized with the spaCy "fr_core_news_sm" pipeline and every
+    token is lowercased. Stop words (as decided by isStopWord) and tokens made
+    only of ASCII punctuation are dropped; the remaining tokens are passed to
+    FrenchLefffLemmatizer().lemmatize.
+
+    Returns a list holding one lemmatizer result per kept token, in the order
+    the tokens appear in `text`.
+    """
+    if tools['lemmatize'] is None:
+        from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
+        tools['lemmatize'] = FrenchLefffLemmatizer().lemmatize
+    if tools['spacyFr'] is None:
+        import spacy
+        tools['spacyFr'] = spacy.load("fr_core_news_sm")
+    tokens = [token.text.lower() for token in tools['spacyFr'](text)]
+    # `punctuation` is a string, so `token in punctuation` is a substring test
+    # that lets runs such as "..." or "--" through; check every character
+    # instead so that any punctuation-only token is discarded.
+    return [tools['lemmatize'](token) for token in tokens
+            if not isStopWord(token)
+            and not all(char in punctuation for char in token)]
+
+def frequencies(corpus, domains, n):
+    """Count the n-grams of the texts of `corpus`, grouped by domain.
+
+    `domains` is re-indexed by articleKey and used to look up the 'domain' of
+    each text yielded by corpus.get_all(); texts whose key (toKey(text)) is
+    missing from that index are ignored. N-grams are built over the
+    whitespace-separated words of text['content'].
+
+    Returns a dict with one entry per distinct value of domains['domain'],
+    inserted in sorted order, each mapped to an nltk.FreqDist of its n-grams.
+    """
+    byKey = domains.set_index(articleKey)
+    counts = {}
+    for domain in sorted(domains['domain'].unique()):
+        counts[domain] = {}
+    for text in corpus.get_all():
+        grams = list(nltk.ngrams(text['content'].split(), n))
+        key = toKey(text)
+        if key not in byKey.index:
+            continue
+        tally = counts[byKey.loc[key]['domain']]
+        for gram in grams:
+            tally[gram] = tally.get(gram, 0) + 1
+    return {domain: nltk.FreqDist(tally) for domain, tally in counts.items()}
+
+def saveFrequencies(freqDist, f, top=100):
+    """Write the `top` most common entries of `freqDist` to `f` as TSV.
+
+    `freqDist` must provide most_common(top) (an nltk.FreqDist does); `f` is
+    handed as-is to pandas.DataFrame.to_csv. The output is tab-separated,
+    has no index column, and starts with the header 'ngram', 'frequency'.
+    """
+    # pandas is not imported at the top of this module, so the bare name
+    # raised a NameError; import it locally, as fullLemmas does for its own
+    # dependencies.
+    import pandas
+    data = pandas.DataFrame(freqDist.most_common(top),
+                            columns=['ngram', 'frequency'])
+    data.to_csv(f, sep='\t', index=False)
+
+def loadFrequencies(f, top=None):
+    """Read a frequency table from `f` and return it as a dict.
+
+    The table is loaded with tabular(f); when `top` is not None only its
+    first `top` rows (via .head) are kept. The result pairs each value of the
+    'ngram' column with the 'frequency' value found on the same row.
+    """
+    table = tabular(f)
+    rows = table if top is None else table.head(top)
+    return dict(zip(rows.ngram, rows.frequency))
--- a/GEODE/Classification/Stopwords.py
+++ b/GEODE/Classification/Stopwords.py
+import nltk
+
+# French stop words from NLTK's 'stopwords' corpus, loaded once when this
+# module is imported and kept as a set for membership tests.
+nltk_stopwords = set(nltk.corpus.stopwords.words('french'))
+# Extra stop words specific to this project, used by isStopWord in addition
+# to the NLTK list. Each word is listed once: 'plus' and 'deux' used to appear
+# twice in this literal, which had no effect on the resulting set.
+geode_stopwords = {'plus', 'dun', 'deux', 'autre', 'cette', 'quelque', 'étoit',
+                   'avoit', 'si', 'dont', 'quon', 'voyez', 'lautre', 'comme',
+                   'fait', 'aussi', 'leurs', 'tous', 'toute', 'autres', 'dit',
+                   'selon', 'tout', 'étoient', 'faire', 'lon', 'celle', 'ainsi',
+                   'quelle', 'être', 'faut', 'peut', 'entre', 'elles', 'ceux',
+                   'donc', 'celui', 'nest', 'dautre', 'doit', 'cet', 'un',
+                   'trois', 'quatre', 'cinq', 'six', 'sept', 'huit',
+                   'neuf', 'dix', 'très', 'ni', 'fit', 'parce', 'dire',
+                   'douze', 'toutes', 'après', "l'", "qu'", "s'", "c'", "d'",
+                   "n'", "j'", "m'", "t'", "jusqu'", "lorsqu'", "puisqu'",
+                   "quoiqu'"}
+
+def isStopWord(word, fromNLTK=True, fromGEODE=True):
+    """Tell whether `word` belongs to one of the enabled stop-word sets.
+
+    `fromNLTK` enables the lookup in nltk_stopwords and `fromGEODE` the lookup
+    in geode_stopwords; the GEODE set is only consulted when the NLTK lookup
+    did not already succeed.
+    """
+    inNLTK = fromNLTK and word in nltk_stopwords
+    if inNLTK:
+        return inNLTK
+    return fromGEODE and word in geode_stopwords
--- a/GEODE/Classification.py
+++ b/GEODE/Classification.py
+from GEODE.Classification.NGrams import fullLemmas
+from GEODE.Classification.Stopwords import isStopWord
+
 domainGroups = [
           'Agriculture - Economie rustique', 'Anatomie', 'Antiquité',
           'Architecture', 'Arts et métiers', 'Beaux-arts',

--- a/GEODE/__init__.py
+++ b/GEODE/__init__.py
@@ -18,7 +18,7 @@ along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """

 from argparse import ArgumentParser
-from GEODE.Classification import superdomains as domains
+from GEODE.Classification import fullLemmas, isStopWord, superdomains as domains
 from GEODE.ENE import eneLabels
 from GEODE.Metadata import article, articleKey, paragraph, paragraphKey, \
        fromKey, relativePath, toKey, uid

--- a/guix.scm
+++ b/guix.scm
@@ -3,6 +3,9 @@
             ((gnu packages python-xyz) #:select (python-matplotlib
                                                  python-nltk
                                                  python-seaborn))
+             ((geode packages models) #:select (nltk-data-corpora-stopwords
+                                                python-frenchleffflemmatizer
+                                                spacy-fr-core-news-sm))
             (guix gexp)
             (guix git-download)
             ((guix licenses) #:select (lgpl3+))
@@ -20,11 +23,15 @@
                  #:select? (git-predicate %source-dir)))
    (build-system pyproject-build-system)
    (propagated-inputs
-      (list python-matplotlib
+      (list nltk-data-corpora-stopwords
+            python-frenchleffflemmatizer
+            python-matplotlib
+            python-nltk
            python-pandas
            python-scikit-learn
            python-seaborn
-            python-spacy))
+            python-spacy
+            spacy-fr-core-news-sm))
    (arguments
     (list #:tests? #f))
    (home-page "https://gitlab.liris.cnrs.fr/geode/geopyck")

--- a/requirements.txt
+++ b/requirements.txt
 matplotlib
+nltk
 pandas 
+frenchleffflemmatizer
 scikit-learn
 seaborn
 spacy