# topNGrams.py
#!/usr/bin/env python3

from EDdA import data
from EDdA.classification import topNGrams
import sys

def __syntax(this):
    """Print the command-line usage to stderr and exit with status 1.

    `this` is the program name shown at the start of the usage line.
    This function never returns: it always ends in sys.exit(1).
    """
    required = "ARTICLES_DATA(.tsv)"
    optional = "[NGRAM SIZE] [TOP_RANKS_SIZE] [DOMAIN]"
    usage = "Syntax: {this} {required} {optional}".format(
        this=this,
        required=required,
        optional=optional,
    )
    print(usage, file=sys.stderr)
    sys.exit(1)

def __populateCache(articlesSource, ns, ranksToTry, domains):
    """Evaluate topNGrams for every (n, ranks, domain) combination.

    Parameters:
        articlesSource: value handed to data.load (the usage message calls
            it ARTICLES_DATA(.tsv)).
        ns: iterable of n-gram sizes.
        ranksToTry: iterable of top-ranks sizes; iterated again for every n.
        domains: iterable of domains passed to each topNGrams result.

    The return values of the calls are discarded; they are made for their
    side effects only (presumably filling a cache inside EDdA -- the caching
    itself is not visible from this file).
    """
    # The loaded articles do not depend on n or ranks, so load them once
    # instead of once per (n, ranks) pair.
    # NOTE(review): assumes topNGrams does not mutate the loaded data -- confirm.
    articles = data.load(articlesSource)
    for n in ns:
        for ranks in ranksToTry:
            cached = topNGrams(articles, n, ranks)
            for domain in domains:
                cached(domain)

if __name__ == '__main__':
    # Command line: ARTICLES_DATA(.tsv) [NGRAM SIZE] [TOP_RANKS_SIZE] [DOMAIN]
    argc = len(sys.argv)
    if argc < 2:
        # The articles file is the only mandatory argument.
        __syntax(sys.argv[0])
    else:
        articlesSource = sys.argv[1]
        try:
            # Each optional argument narrows the corresponding sweep to a
            # single value; without it the default set of values is tried.
            ns = [int(sys.argv[2])] if argc > 2 else range(1, 4)
            ranksToTry = [int(sys.argv[3])] if argc > 3 else [10, 100, 50]
        except ValueError:
            # A non-numeric size is a usage error: print the syntax reminder
            # (which exits with status 1) rather than a traceback.
            __syntax(sys.argv[0])
        domains = [sys.argv[4]] if argc > 4 else data.domains
        __populateCache(articlesSource, ns, ranksToTry, domains)