#!/usr/bin/env python3
"""Match article heads between two sets of CSV metadata (EDdA and LGE).

Usage: SCRIPT EDDA_METADATA_CSV LGE_ROOT_DIRECTORY [OUTPUT_CSV]

The EDdA metadata is read from a single CSV file. The LGE metadata is read
from ``LGE_ROOT_DIRECTORY/T<n>/metadata.csv`` for every tome number in
``LGE_TOMES``. Articles whose head is unique on both sides and identical are
written out as CSV, to OUTPUT_CSV when given and to standard output otherwise.
"""

import csv
import sys

# Tome numbers looked up under the LGE root directory (T1 to T31).
LGE_TOMES = range(1, 32)


def EDdARow(columns):
    """Build an EDdA article record from one CSV row.

    The first three columns are read as tome, article and head, in that
    order; any further column is ignored.
    """
    return {
        'tome': columns[0],
        'article': columns[1],
        'head': columns[2],
    }


def LGERow(tome):
    """Return a row reader that builds LGE article records for `tome`.

    The returned function reads the first three columns of a CSV row as id,
    rank and head, and attaches the given `tome` to the record.
    """
    def readRow(columns):
        return {
            'id': columns[0],
            'tome': tome,
            'rank': columns[1],
            'head': columns[2]
        }

    return readRow


def unserializeMetadata(path, rowReader):
    """Yield one record per data row of the CSV file at `path`.

    The first row is treated as a header and skipped; every other row is
    converted with `rowReader`. This is a generator, so the file is only
    opened once iteration starts.
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends.
    with open(path, newline='') as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header row (no-op on an empty file)
        for row in rows:
            yield rowReader(row)


def concat(generators):
    """Yield the items of each iterable in `generators`, one after another."""
    for g in generators:
        yield from g


def naiveIndexBy(field, elements):
    """Index `elements` by the value of `field`.

    A key seen more than once is kept in the index but mapped to None, which
    marks it as ambiguous.
    """
    d = {}
    for e in elements:
        key = e[field]
        d[key] = None if key in d else e
    return d


def growPrefixes(d, keys, maxLength):
    """Extend the index `d` in place with prefixes of the keys in `keys`.

    Lengths are processed from `maxLength` down to 2. Each key of the current
    length proposes the prefix obtained by dropping its last character:
    - a prefix proposed by a single key and absent from `d` inherits that
      key's value; if the value is not None the prefix is also added to
      `keys`, so it can propose a shorter prefix in a later round;
    - a prefix proposed by several keys, or already present in `d`, is
      ambiguous: it is set to None if missing from `d` (existing entries are
      left untouched) and is not added to `keys`.

    Both `d` and `keys` are mutated; nothing is returned.
    """
    for length in range(maxLength, 1, -1):
        newGeneration = {}
        for key in keys:
            if len(key) == length:
                newKey = key[:-1]
                if newKey in newGeneration or newKey in d:
                    newGeneration[newKey] = None
                else:
                    newGeneration[newKey] = d[key]
        # `keys` is only modified here, after the iteration over it is done.
        for key, value in newGeneration.items():
            if value is not None:
                d[key] = value
                keys.add(key)
            elif key not in d:
                d[key] = value


def indexBy(field, elements, prefix=True):
    """Index `elements` by `field`, optionally adding unambiguous prefixes.

    Duplicate keys are mapped to None, as in `naiveIndexBy`. When `prefix` is
    true the index is then extended by `growPrefixes`. An empty `elements`
    yields an empty index.
    """
    d = naiveIndexBy(field, elements)
    # Guard against an empty index: max() of an empty sequence raises.
    if prefix and d:
        keys = set(d)
        growPrefixes(d, keys, max(map(len, keys)))
    return d


def headWords(head):
    """Return the candidate words of an article head.

    A head made of exactly one whitespace-separated word is returned as is.
    Otherwise each word is stripped of leading and trailing ',' and '.'
    characters and only the words for which str.isupper() holds are kept.
    """
    words = head.split()
    if len(words) == 1:
        return words
    stripped = (w.strip(',.') for w in words)
    return [w for w in stripped if w.isupper()]


def identify(head, haystack):
    """Look `head` up in the index `haystack`, falling back to prefixes.

    Returns a dict describing the match, or None when nothing usable is
    found:
    - exact key with a non-None value: {'type': 'exact', 'match', 'found'};
    - otherwise the longest proper prefix of `head` present in `haystack`,
      if its value is not None: {'type': 'prefix', 'match', 'found',
      'precision'}, where precision is len(prefix) / len(head).
    A key mapped to None (ambiguous) always yields None.
    """
    if head in haystack:
        found = haystack[head]
        if found is None:
            return None
        return {'type': 'exact', 'match': head, 'found': found}

    # Shorten the head one character at a time until a known key is hit.
    prefix = head[:-1]
    while len(prefix) > 0 and prefix not in haystack:
        prefix = prefix[:-1]
    if prefix in haystack and haystack[prefix] is not None:
        return {
            'type': 'prefix',
            'match': head,
            'found': haystack[prefix],
            'precision': len(prefix) / len(head)
        }
    return None


def naiveGetArrows(source, target):
    """Yield {'source', 'target'} pairs of articles sharing the same head.

    Both collections are indexed with `naiveIndexBy` on 'head'; a pair is
    produced only when the head is unique on both sides.
    """
    indexedSource = naiveIndexBy('head', source)
    indexedTarget = naiveIndexBy('head', target)
    for head, article in indexedSource.items():
        if article is None:
            continue
        match = indexedTarget.get(head)
        if match is not None:
            yield {
                'source': article,
                'target': match
            }


def getArrows(source, target):
    """Yield arrows from `source` articles to entries identified in `target`.

    `target` is used as an index for `identify` (membership test and key
    lookup), e.g. the result of `indexBy`. Each word returned by `headWords`
    for the article head is identified separately:
    - exactly one identified word gives a 'match' arrow whose 'target' is
      the dict returned by `identify`;
    - several identified words give an 'ambiguity' arrow whose 'target' is
      the list of those dicts;
    - articles with no identified word produce nothing.
    """
    for article in source:
        identified = (identify(w, target) for w in headWords(article['head']))
        entries = [e for e in identified if e is not None]
        if len(entries) == 1:
            yield {
                'type': 'match',
                'source': article,
                'target': entries[0]
            }
        elif len(entries) > 1:
            yield {
                'type': 'ambiguity',
                'source': article,
                'target': entries
            }


def interesting(arrow):
    """Tell whether `arrow` (as yielded by `getArrows`) is a reliable match.

    True only for a 'match' arrow whose matched word is longer than three
    characters and is either an exact match or a prefix match with a
    precision above 0.8. Any other arrow gives False.
    """
    if arrow['type'] != 'match':
        return False
    target = arrow['target']
    return len(target['match']) > 3 and (
        target['type'] == 'exact' or target['precision'] > 0.8
    )

#gold = [a for a in arrows if interesting(a)]


def _writeMetadata(arrows, output):
    """Write the header row and one row per arrow to the stream `output`."""
    toCsv = csv.writer(output, lineterminator='\n')
    toCsv.writerow(
        ['head', 'id', 'tomeEDdA', 'rankEDdA', 'tomeLGE', 'rankLGE']
    )
    for arrow in arrows:
        source = arrow['source']
        target = arrow['target']
        toCsv.writerow([
            target['head'],
            target['id'],
            source['tome'],
            source['article'],
            target['tome'],
            target['rank']
        ])


def getMetadata(arrows, path=None):
    """Write `arrows` as CSV to `path`, or to standard output if None.

    Each arrow must provide 'tome' and 'article' under 'source' and 'head',
    'id', 'tome' and 'rank' under 'target'; this is the shape yielded by
    `naiveGetArrows` over `EDdARow` and `LGERow` records. The output file is
    closed before returning.
    """
    if path is None:
        _writeMetadata(arrows, sys.stdout)
    else:
        # newline='' keeps the '\n' line terminator untouched on every
        # platform, as recommended by the csv documentation.
        with open(path, 'w', newline='') as output:
            _writeMetadata(arrows, output)


def main(argv):
    """Run the script with the command line `argv`; return an exit status."""
    if len(argv) < 3:
        program = argv[0] if argv else 'script'
        print(
            f'usage: {program} EDDA_METADATA_CSV LGE_ROOT_DIRECTORY'
            ' [OUTPUT_CSV]',
            file=sys.stderr
        )
        return 2
    edda = list(unserializeMetadata(argv[1], EDdARow))
    lge = list(concat([
        unserializeMetadata(f'{argv[2]}/T{T}/metadata.csv', LGERow(T))
        for T in LGE_TOMES
    ]))
    # The output path is optional: getMetadata falls back to stdout.
    outputPath = argv[3] if len(argv) > 3 else None
    getMetadata(list(naiveGetArrows(edda, lge)), outputPath)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))