diff --git a/GEODE/Classification/NGrams.py b/GEODE/Classification/NGrams.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb2b6c4da96837bc1568346201e01222ea8a5188
--- /dev/null
+++ b/GEODE/Classification/NGrams.py
@@ -0,0 +1,80 @@
+from GEODE.Metadata import articleKey, toKey
+from GEODE.Classification.Stopwords import isStopWord
+from GEODE.Store import tabular
+import nltk
+import pandas
+from string import punctuation
+
+# Lazily-loaded NLP resources: both are created on the first call to
+# fullLemmas and reused by the following ones.
+tools = {'lemmatize': None, 'spacyFr': None}
+
+def fullLemmas(text):
+    """Return the lemmas of the meaningful tokens of a French text.
+
+    The text is tokenized with spaCy's "fr_core_news_sm" model and lowercased;
+    stop words (see isStopWord) and punctuation-only tokens are dropped and
+    the remaining tokens are lemmatized with FrenchLefffLemmatizer.
+
+    :param text: the raw text to process
+    :return: the list of lemmas, in the order of the tokens kept
+    """
+    if tools['lemmatize'] is None:
+        from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
+        tools['lemmatize'] = FrenchLefffLemmatizer().lemmatize
+    if tools['spacyFr'] is None:
+        import spacy
+        tools['spacyFr'] = spacy.load("fr_core_news_sm")
+    tokens = [token.text.lower() for token in tools['spacyFr'](text)]
+    # Test every character: `token in punctuation` is a substring test on the
+    # punctuation string, which lets tokens such as "..." or "!!" through.
+    return [tools['lemmatize'](token) for token in tokens
+            if not isStopWord(token)
+            and not all(char in punctuation for char in token)]
+
+def frequencies(corpus, domains, n):
+    """Count the n-grams of the texts of a corpus, grouped by domain.
+
+    Texts whose key (computed by toKey) is absent from `domains` are ignored.
+    The n-grams are built from the whitespace-separated words of the
+    'content' of each text.
+
+    :param corpus: object whose get_all() yields the texts
+    :param domains: DataFrame with the articleKey column(s) and a 'domain'
+        column
+    :param n: size of the n-grams
+    :return: dict mapping each domain to the nltk.FreqDist of its n-grams
+    """
+    metadata = domains.set_index(articleKey)
+    byDomain = {domain: nltk.FreqDist()
+                for domain in sorted(domains['domain'].unique())}
+    for text in corpus.get_all():
+        key = toKey(text)
+        # Check the metadata first so no n-gram is built for ignored texts
+        if key in metadata.index:
+            domain = metadata.loc[key]['domain']
+            byDomain[domain].update(nltk.ngrams(text['content'].split(), n))
+    return byDomain
+
+def saveFrequencies(freqDist, f, top=100):
+    """Write the most frequent n-grams of a distribution to a TSV file.
+
+    :param freqDist: distribution providing most_common (e.g. nltk.FreqDist)
+    :param f: path or file object passed to DataFrame.to_csv
+    :param top: number of n-grams to keep, most frequent first
+    """
+    data = pandas.DataFrame(freqDist.most_common(top),
+                            columns=['ngram', 'frequency'])
+    data.to_csv(f, sep='\t', index=False)
+
+def loadFrequencies(f, top=None):
+    """Load a table of n-gram frequencies as a dict.
+
+    :param f: source handed to tabular (from GEODE.Store)
+    :param top: when not None, only the first `top` rows are kept
+    :return: dict mapping each 'ngram' value to its 'frequency'
+    """
+    data = tabular(f)
+    if top is not None:
+        data = data.head(top)
+    return dict(zip(data.ngram, data.frequency))
diff --git a/GEODE/Classification/Stopwords.py b/GEODE/Classification/Stopwords.py
new file mode 100644
index 0000000000000000000000000000000000000000..83f4d80832977046987b0a4e86e21282feea5164
--- /dev/null
+++ b/GEODE/Classification/Stopwords.py
@@ -0,0 +1,30 @@
+import nltk
+
+# French stop words provided by the NLTK 'stopwords' corpus
+nltk_stopwords = set(nltk.corpus.stopwords.words('french'))
+
+# Additional stop words chosen for GEODE; some look like elided forms written
+# without their apostrophe ('dun', 'quon', 'lon'), others keep it ("l'").
+geode_stopwords = {
+    'plus', 'dun', 'deux', 'autre', 'cette', 'quelque', 'étoit', 'avoit',
+    'si', 'dont', 'quon', 'voyez', 'lautre', 'comme', 'fait', 'aussi',
+    'leurs', 'tous', 'toute', 'autres', 'dit', 'selon', 'tout', 'étoient',
+    'faire', 'lon', 'celle', 'ainsi', 'quelle', 'être', 'faut', 'peut',
+    'entre', 'elles', 'ceux', 'donc', 'celui', 'nest', 'dautre', 'doit',
+    'cet', 'un', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf',
+    'dix', 'très', 'ni', 'fit', 'parce', 'dire', 'douze', 'toutes', 'après',
+    "l'", "qu'", "s'", "c'", "d'", "n'", "j'", "m'", "t'", "jusqu'",
+    "lorsqu'", "puisqu'", "quoiqu'",
+}
+
+def isStopWord(word, fromNLTK=True, fromGEODE=True):
+    """Tell whether a word belongs to the enabled stop word lists.
+
+    :param word: the word to look up, compared as is (no case folding)
+    :param fromNLTK: whether to search the NLTK French stop words
+    :param fromGEODE: whether to search the GEODE-specific stop words
+    :return: a true value if the word is in one of the enabled lists
+    """
+    if fromNLTK and word in nltk_stopwords:
+        return True
+    return fromGEODE and word in geode_stopwords
diff --git a/GEODE/Classification.py b/GEODE/Classification/__init__.py
similarity index 91%
rename from GEODE/Classification.py
rename to GEODE/Classification/__init__.py
index f33280e59c7e7f5f558d280a76188b3e478777d3..efc69663354b3cf1dd187852aefdd60895dd3ea1 100644
--- a/GEODE/Classification.py
+++ b/GEODE/Classification/__init__.py
@@ -1,3 +1,6 @@
+from GEODE.Classification.NGrams import fullLemmas
+from GEODE.Classification.Stopwords import isStopWord
+
 domainGroups = [
            'Agriculture - Economie rustique', 'Anatomie', 'Antiquité',
            'Architecture', 'Arts et métiers', 'Beaux-arts',
diff --git a/GEODE/__init__.py b/GEODE/__init__.py
index e7735f5fba302172f401b79b3b2fb6223885b756..57b432410b8436aecd9682ecced85ccaabb969f4 100644
--- a/GEODE/__init__.py
+++ b/GEODE/__init__.py
@@ -18,7 +18,7 @@ along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """
 
 from argparse import ArgumentParser
-from GEODE.Classification import superdomains as domains
+from GEODE.Classification import fullLemmas, isStopWord, superdomains as domains
 from GEODE.ENE import eneLabels
 from GEODE.Metadata import article, articleKey, paragraph, paragraphKey, \
         fromKey, relativePath, toKey, uid
diff --git a/guix.scm b/guix.scm
index ec463d136fc27e2b01877ccda10af0d351a87109..8488c739744c29ac6b3d9df0402e8f37c812ed3c 100644
--- a/guix.scm
+++ b/guix.scm
@@ -3,6 +3,9 @@
              ((gnu packages python-xyz) #:select (python-matplotlib
                                                   python-nltk
                                                   python-seaborn))
+             ((geode packages models) #:select (nltk-data-corpora-stopwords
+                                                python-frenchleffflemmatizer
+                                                spacy-fr-core-news-sm))
              (guix gexp)
              (guix git-download)
              ((guix licenses) #:select (lgpl3+))
@@ -20,11 +23,15 @@
                   #:select? (git-predicate %source-dir)))
     (build-system pyproject-build-system)
     (propagated-inputs
-      (list python-matplotlib
+      (list nltk-data-corpora-stopwords
+            python-frenchleffflemmatizer
+            python-matplotlib
+            python-nltk
             python-pandas
             python-scikit-learn
             python-seaborn
-            python-spacy))
+            python-spacy
+            spacy-fr-core-news-sm))
     (arguments
      (list #:tests? #f))
     (home-page "https://gitlab.liris.cnrs.fr/geode/geopyck")
diff --git a/requirements.txt b/requirements.txt
index b8187358a1c9b3056e922b282bdf496801090773..cbfc58ab6df8694fc298a5a3ad4e0f9c7686ce71 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 matplotlib
+nltk
 pandas 
+frenchleffflemmatizer
 scikit-learn
 seaborn
 spacy