#!/usr/bin/env -S PYTHONPATH=lib/python python3
"""Summarise one metric over each group of a partition of a table.

Usage: measure-partition.py PATH KEY METRIC OUTPUT_PATH

The table at PATH is loaded with GEODE's ``tabular`` helper, grouped by the
column KEY, and descriptive statistics of column METRIC are written for each
group to OUTPUT_PATH as tab-separated values.
"""

from GEODE import tabular
from GEODE.Store import prepare
import os
import pandas
import sys


def _stripColons(value):
    """Strip leading/trailing ':' from a string; return other values as is."""
    # Group keys are not necessarily strings (ints, floats, timestamps...);
    # calling .strip on those would raise AttributeError.
    return value.strip(':') if isinstance(value, str) else value


def keyOf(t):
    """Normalise a group key by removing surrounding ':' characters.

    A tuple key (as produced when grouping on several columns) is normalised
    component by component and returned as a tuple; any other key is
    normalised directly.
    """
    if isinstance(t, tuple):
        return tuple(_stripColons(s) for s in t)
    return _stripColons(t)


def getStats(series):
    """Return the mean, std, min, quartiles and max of ``series``.

    The values are taken from ``series.describe()``, so the series has to be
    numeric: for other dtypes ``describe`` does not produce these entries and
    the selection raises ``KeyError``.
    """
    columns = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
    return series.describe()[columns]


# NOTE: the name is misspelt ("partiton") but is kept as is so that existing
# callers keep working.
def partitonStats(metric, partition):
    """Build a per-group statistics table for column ``metric``.

    ``partition`` is an iterable of ``(key, frame)`` pairs, such as a pandas
    ``groupby`` object. The result has one row per (normalised) group key
    with the columns of ``getStats`` plus:

    * ``total``: the sum of ``metric`` within the group;
    * ``percentage``: the share of that sum in the grand total, formatted as
      a string with at most two decimals and a trailing ``%``.
    """
    # Keys are normalised first: if two raw keys collapse to the same
    # normalised key, the last group seen wins.
    groups = {keyOf(name): frame for name, frame in partition}
    # Extract the metric column once per group instead of once per use.
    values = {name: frame[metric] for name, frame in groups.items()}
    byGroup = {name: getStats(series) for name, series in values.items()}
    stats = pandas.DataFrame(byGroup).transpose()
    total = [series.sum() for series in values.values()]
    stats['total'] = total
    cardinal = stats['total'].sum()
    stats['percentage'] = [f"{round(100*t/cardinal, 2)}%" for t in total]
    return stats


def measurePartition(path, key, metric, outputPath):
    """Group the table at ``path`` by ``key`` and write stats of ``metric``.

    The statistics computed by ``partitonStats`` are written to
    ``outputPath`` as a tab-separated file.
    """
    # presumably tabular() returns a pandas DataFrame (it is used with column
    # assignment and groupby below) -- confirm against GEODE.
    measures = tabular(path)
    # Constant column: using 'count' as the metric makes 'total' the number
    # of rows in each group. This overwrites any existing 'count' column.
    measures['count'] = 1
    stats = partitonStats(metric, measures.groupby(key))
    stats.to_csv(outputPath, sep='\t')


if __name__ == '__main__':
    # Fail with a readable usage line instead of a TypeError traceback when
    # the number of command-line arguments is wrong.
    if len(sys.argv) != 5:
        print(f"Usage: {sys.argv[0]} PATH KEY METRIC OUTPUT_PATH",
              file=sys.stderr)
        sys.exit(1)
    measurePartition(*sys.argv[1:])