Skip to content
Snippets Groups Projects
Commit 6ae31c0f authored by Alice Brenon
Browse files

Add NGrams and Stopwords functions to the GEODE.Classification module

parent 25558a3e
No related branches found
No related tags found
No related merge requests found
from GEODE.Metadata import articleKey, toKey
from GEODE.Classification.Stopwords import isStopWord
from GEODE.Store import tabular
import nltk
from string import punctuation
# Cache of lazily-loaded NLP resources: both entries stay None until
# fullLemmas needs them and stores the loaded object here.
tools = {'lemmatize': None, 'spacyFr': None}
def fullLemmas(text):
    """Return the lemmas of the meaningful tokens found in `text`.

    The FrenchLefffLemmatizer and the spaCy "fr_core_news_sm" pipeline are
    imported and instantiated on the first call only, then kept in the
    module-level `tools` cache for subsequent calls.

    Each token produced by the spaCy pipeline is lower-cased; tokens for
    which `isStopWord` is true, or which occur in `string.punctuation`, are
    discarded. The remaining tokens are lemmatized and returned as a list,
    in their order of appearance.
    """
    if tools['lemmatize'] is None:
        from french_lefff_lemmatizer.french_lefff_lemmatizer import (
            FrenchLefffLemmatizer,
        )
        tools['lemmatize'] = FrenchLefffLemmatizer().lemmatize
    if tools['spacyFr'] is None:
        import spacy
        tools['spacyFr'] = spacy.load("fr_core_news_sm")

    lemmatize, tokenize = tools['lemmatize'], tools['spacyFr']
    lemmas = []
    for token in tokenize(text):
        word = token.text.lower()
        # `punctuation` is a plain string, so `in` is a substring test here:
        # it matches single punctuation marks, the empty string, and runs
        # of characters that are contiguous in that string.
        if isStopWord(word) or word in punctuation:
            continue
        lemmas.append(lemmatize(word))
    return lemmas
def frequencies(corpus, domains, n):
    """Count the `n`-grams of the corpus texts, grouped by domain.

    `domains` is indexed by `articleKey` to look up the 'domain' value of
    each text; one (initially empty) counter is prepared for every distinct
    value of its 'domain' column, in sorted order.

    Every item yielded by `corpus.get_all()` has its 'content' split on
    whitespace and turned into n-grams; items whose key (as computed by
    `toKey`) is absent from the index are ignored.

    Returns a dict mapping each domain to an `nltk.FreqDist` of its n-grams.
    """
    metadata = domains.set_index(articleKey)
    counts = {}
    for domain in sorted(domains['domain'].unique()):
        counts[domain] = {}

    for text in corpus.get_all():
        grams = list(nltk.ngrams(text['content'].split(), n))
        key = toKey(text)
        if key not in metadata.index:
            # No known domain for this text: nothing to count it under.
            continue
        tally = counts[metadata.loc[key]['domain']]
        for gram in grams:
            tally[gram] = tally.get(gram, 0) + 1

    return {domain: nltk.FreqDist(tally) for domain, tally in counts.items()}
def saveFrequencies(freqDist, f, top=100):
    """Write the `top` most common entries of `freqDist` to `f` as TSV.

    Parameters:
        freqDist: any object exposing `most_common(k)` and returning
            (item, count) pairs, such as an `nltk.FreqDist`.
        f: path or writable text buffer handed to `DataFrame.to_csv`.
        top: number of entries to keep (passed to `most_common`).

    The output has a header line with the columns 'ngram' and 'frequency',
    is tab-separated and carries no index column.
    """
    # pandas is not among this module's top-level imports, so the name has
    # to be brought into scope here; without it the call raises NameError.
    import pandas

    data = pandas.DataFrame(freqDist.most_common(top),
                            columns=['ngram', 'frequency'])
    data.to_csv(f, sep='\t', index=False)
def loadFrequencies(f, top=None):
    """Load a frequency table from `f` into a dict.

    The table is read with `tabular` and must expose the `ngram` and
    `frequency` columns, as well as `head` (presumably a pandas DataFrame;
    verify against GEODE.Store). When `top` is given, only the first `top`
    rows are kept.

    Returns a dict mapping each `ngram` value to its `frequency` value.
    """
    table = tabular(f)
    rows = table if top is None else table.head(top)
    return dict(zip(rows.ngram, rows.frequency))
import nltk
# French stop words from NLTK's stopwords corpus, stored as a set. This is
# evaluated at import time.
nltk_stopwords = set(nltk.corpus.stopwords.words('french'))
# Additional stop words specific to GEODE, complementing the NLTK list.
# Entries are grouped by kind for readability; the grouping has no effect on
# the resulting set.
geode_stopwords = {
    # Common function words and frequent verbs (including old spellings
    # such as 'étoit' / 'avoit').
    'plus', 'autre', 'cette', 'quelque', 'étoit', 'avoit', 'si', 'dont',
    'voyez', 'comme', 'fait', 'aussi', 'leurs', 'tous', 'toute', 'autres',
    'dit', 'selon', 'tout', 'étoient', 'faire', 'celle', 'ainsi', 'quelle',
    'être', 'faut', 'peut', 'entre', 'elles', 'ceux', 'donc', 'celui',
    'doit', 'cet', 'très', 'ni', 'fit', 'parce', 'dire', 'toutes', 'après',
    # Presumably elided forms whose apostrophe was lost upstream.
    'dun', 'quon', 'lautre', 'lon', 'nest', 'dautre',
    # Number words.
    'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf',
    'dix', 'douze',
    # Elided forms, apostrophe included.
    "l'", "qu'", "s'", "c'", "d'", "n'", "j'", "m'", "t'",
    "jusqu'", "lorsqu'", "puisqu'", "quoiqu'",
}
def isStopWord(word, fromNLTK=True, fromGEODE=True):
    """Tell whether `word` belongs to the enabled stop-word lists.

    `fromNLTK` enables the lookup in `nltk_stopwords` and `fromGEODE` the
    lookup in `geode_stopwords`; a word found in any enabled list is a stop
    word. The comparison is an exact set membership test, so callers are
    expected to normalize `word` themselves.
    """
    if fromNLTK and word in nltk_stopwords:
        return True
    return fromGEODE and word in geode_stopwords
from GEODE.Classification.NGrams import fullLemmas
from GEODE.Classification.Stopwords import isStopWord
domainGroups = [
'Agriculture - Economie rustique', 'Anatomie', 'Antiquité',
'Architecture', 'Arts et métiers', 'Beaux-arts',
......
......@@ -18,7 +18,7 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from argparse import ArgumentParser
from GEODE.Classification import superdomains as domains
from GEODE.Classification import fullLemmas, isStopWord, superdomains as domains
from GEODE.ENE import eneLabels
from GEODE.Metadata import article, articleKey, paragraph, paragraphKey, \
fromKey, relativePath, toKey, uid
......
......@@ -3,6 +3,9 @@
((gnu packages python-xyz) #:select (python-matplotlib
python-nltk
python-seaborn))
((geode packages models) #:select (nltk-data-corpora-stopwords
python-frenchleffflemmatizer
spacy-fr-core-news-sm))
(guix gexp)
(guix git-download)
((guix licenses) #:select (lgpl3+))
......@@ -20,11 +23,15 @@
#:select? (git-predicate %source-dir)))
(build-system pyproject-build-system)
(propagated-inputs
(list python-matplotlib
(list nltk-data-corpora-stopwords
python-frenchleffflemmatizer
python-matplotlib
python-nltk
python-pandas
python-scikit-learn
python-seaborn
python-spacy))
python-spacy
spacy-fr-core-news-sm))
(arguments
(list #:tests? #f))
(home-page "https://gitlab.liris.cnrs.fr/geode/geopyck")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment