#!/usr/bin/env python3
"""Match EDdA articles with LGE articles that share the same headword.

Usage: <script> EDDA_METADATA_CSV LGE_ROOT_DIR [OUTPUT_CSV]

The LGE metadata is read from ``LGE_ROOT_DIR/T<n>/metadata.csv`` for every
tome number in ``LGE_TOMES``. When OUTPUT_CSV is omitted the result is
written to standard output.
"""

import csv
import sys

# Tome numbers whose metadata files are read under the LGE root directory.
LGE_TOMES = range(1, 32)


def EDdARow(columns):
    """Build an EDdA article record from one CSV row.

    The row is read positionally: tome, article, head.
    """
    return {
        'tome': columns[0],
        'article': columns[1],
        'head': columns[2],
    }


def LGERow(tome):
    """Return a row reader building LGE article records for the given tome.

    The returned callable reads a CSV row positionally (id, rank, head) and
    attaches `tome` to the record, since the row itself does not carry it.
    """
    return lambda columns: {
        'id': columns[0],
        'tome': tome,
        'rank': columns[1],
        'head': columns[2]
    }


def unserializeMetadata(path, rowReader):
    """Yield one record per data row of the CSV file at `path`.

    The first row is treated as a header and skipped. Every other row is
    converted with `rowReader` (e.g. `EDdARow` or `LGERow(tome)`).
    This is a generator: the file is opened when iteration starts and stays
    open until the generator is exhausted or closed.
    """
    # newline='' lets the csv module handle line endings itself, including
    # newlines embedded in quoted fields.
    with open(path, newline='') as f:
        inputFile = csv.reader(f)
        next(inputFile, None)  # skip the header row (no-op on an empty file)
        for row in inputFile:
            yield rowReader(row)


def concat(generators):
    """Yield the elements of each iterable in `generators`, in order."""
    for g in generators:
        yield from g


def naiveIndexBy(field, elements):
    """Index `elements` by the value of `field`.

    A key seen exactly once maps to its element. A key seen more than once
    maps to None, which marks it as ambiguous.
    """
    d = {}
    for e in elements:
        key = e[field]
        # Any repeated key is poisoned with None, however many times it occurs.
        d[key] = None if key in d else e
    return d


def naiveGetArrows(source, target):
    """Yield {'source', 'target'} pairs of records sharing the same 'head'.

    Only heads that are unambiguous on both sides (present exactly once in
    `source` and exactly once in `target`) produce a pair.
    """
    indexedSource = naiveIndexBy('head', source)
    indexedTarget = naiveIndexBy('head', target)
    for head, article in indexedSource.items():
        if article is None:
            continue  # ambiguous in source
        match = indexedTarget.get(head)
        if match is not None:  # present and unambiguous in target
            yield {
                'source': article,
                'target': match
            }


# --- Disabled prefix-based matching code, kept as found in the file. ---

#def growPrefixes(d, keys, maxLength):
#    for length in range(maxLength, 1, -1):
#        newGeneration = {}
#        for key in keys:
#            if len(key) == length:
#                newKey = key[:-1]
#                if newKey in newGeneration or newKey in d:
#                    newGeneration[newKey] = None
#                else:
#                    newGeneration[newKey] = d[key]
#        for key, value in newGeneration.items():
#            if value is not None:
#                d[key] = value
#                keys.add(key)
#            elif key not in d:
#                d[key] = value

#def indexBy(field, elements, prefix=True):
#    d = {}
#    for e in elements:
#        key = e[field]
#        if key in d:
#            d[key] = None
#        else:
#            d[key] = e
#    if prefix:
#        keys = set(d.keys())
#        growPrefixes(d, keys, max(map(len, keys)))
#    return d

#def headWords(head):
#    words = head.split()
#    if len(words) == 1:
#        return words
#    else:
#        return [w for w in map(lambda s: s.strip(',.'), words) if w.isupper()]

#def identify(head, haystack):
#    if head in haystack:
#        if haystack[head] is not None:
#            return {'type': 'exact', 'match': head, 'found': haystack[head]}
#        else:
#            return None
#    else:
#        prefix = head[:-1]
#        while len(prefix) > 0 and prefix not in haystack:
#            prefix = prefix[:-1]
#        if prefix in haystack and haystack[prefix] is not None:
#            return {
#                'type': 'prefix',
#                'match': head,
#                'found': haystack[prefix],
#                'precision': len(prefix) / len(head)
#            }
#        else:
#            return None
#
#def getArrows(source, target):
#    for article in source:
#        heads = headWords(article['head'])
#        identified = map(lambda w: identify(w, target), heads)
#        entries = [e for e in identified if e is not None]
#        if len(entries) == 1:
#            yield {
#                'type': 'match',
#                'source': article,
#                'target': entries[0]
#            }
#        elif len(entries) > 1:
#            yield {
#                'type': 'ambiguity',
#                'source': article,
#                'target': entries
#            }

#def interesting(arrow):
#    if arrow['type'] == 'match':
#        target = arrow['target']
#        return len(target['match']) > 3 and (target['type'] == 'exact' or target['precision'] > 0.8)

#gold = [a for a in arrows if interesting(a)]


def getMetadata(arrows, path=None):
    """Write `arrows` as CSV to `path`, or to standard output if path is None.

    Each arrow is a {'source', 'target'} pair as produced by
    `naiveGetArrows`; the target record must provide 'head', 'id', 'tome'
    and 'rank', the source record 'tome' and 'article'.
    """
    output = sys.stdout if path is None else open(path, 'w')
    try:
        toCsv = csv.writer(output, lineterminator='\n')
        toCsv.writerow(
            ['head', 'id', 'tomeEDdA', 'rankEDdA', 'tomeLGE', 'rankLGE']
        )
        for arrow in arrows:
            source = arrow['source']
            target = arrow['target']
            toCsv.writerow([
                target['head'],
                target['id'],
                source['tome'],
                source['article'],
                target['tome'],
                target['rank']
            ])
    finally:
        # Close only what we opened; stdout must stay usable for the caller.
        if output is not sys.stdout:
            output.close()


def main(argv):
    """Run the matching: argv is [prog, edda_csv, lge_root, (output_csv)]."""
    if len(argv) < 3:
        print(
            f'Usage: {argv[0] if argv else "prog"} '
            'EDDA_METADATA_CSV LGE_ROOT_DIR [OUTPUT_CSV]',
            file=sys.stderr
        )
        return 1
    edda = list(unserializeMetadata(argv[1], EDdARow))
    lge = list(concat([
        unserializeMetadata(f'{argv[2]}/T{T}/metadata.csv', LGERow(T))
        for T in LGE_TOMES
    ]))
    outputPath = argv[3] if len(argv) > 3 else None
    getMetadata(naiveGetArrows(edda, lge), outputPath)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))