-
Alice Brenon authored
Restructure the project, separating the core lib from classification-related modules, and exposing the executable parts in a separate scripts dir; add a guix package definition
6cacf0e6
topNGrams.py 1.04 KiB
#!/usr/bin/env python3
from EDdA import data
from EDdA.classification import topNGrams
import sys
def __syntax(this):
    """Print the command-line usage on stderr and exit with status 1.

    this -- the program name to display in the usage line (the caller
            passes sys.argv[0]).
    """
    print(
        "Syntax: {this} {required} {optional}".format(
            this=this,
            required="ARTICLES_DATA(.tsv)",
            # The optional arguments are read by position, so each one can
            # only be given when all the previous ones are present: nested
            # brackets make that dependency explicit.
            optional="[NGRAM_SIZE [TOP_RANKS_SIZE [DOMAIN]]]"
        ),
        file=sys.stderr
    )
    sys.exit(1)
def __populateCache(articlesSource, ns, ranksToTry, domains):
    """Evaluate topNGrams for every (n, ranks, domain) combination.

    articlesSource -- value handed to data.load to obtain the articles
    ns             -- iterable of n-gram sizes to try
    ranksToTry     -- iterable of top-ranks sizes to try
    domains        -- iterable of domains passed to each topNGrams result

    The return values of the calls are discarded; judging by the names,
    the calls are made to fill a cache (not verifiable from this file).
    """
    # The loaded articles depend neither on n nor on ranks, so load them once
    # instead of once per (n, ranks) pair.
    # NOTE(review): assumes topNGrams does not modify the loaded data in
    # place -- confirm against EDdA.classification.
    articles = data.load(articlesSource)
    for n in ns:
        for ranks in ranksToTry:
            cached = topNGrams(articles, n, ranks)
            for domain in domains:
                cached(domain)
# Entry point: ARTICLES_DATA is mandatory; each optional positional argument
# restricts the corresponding sweep to a single value instead of the defaults.
if __name__ == '__main__':
    argc = len(sys.argv)
    if argc < 2:
        __syntax(sys.argv[0])
    else:
        articlesSource = sys.argv[1]
        try:
            ns = [int(sys.argv[2])] if argc > 2 else range(1, 4)
            ranksToTry = [int(sys.argv[3])] if argc > 3 else [10, 100, 50]
        except ValueError:
            # A non-numeric size is a usage error: show the expected syntax
            # (which exits with status 1) rather than a raw traceback.
            __syntax(sys.argv[0])
        domains = [sys.argv[4]] if argc > 4 else data.domains
        __populateCache(articlesSource, ns, ranksToTry, domains)