-
Alice Brenon authored1e28d593
parallel-links.py 4.75 KiB
#!/usr/bin/env python3
import csv
import sys
def EDdARow(columns):
    """Turn one CSV row of the EDdA metadata into a record.

    `columns` is indexed by position: 0 is stored under 'tome', 1 under
    'article' and 2 under 'head'. Any extra column is ignored.
    """
    tome = columns[0]
    article = columns[1]
    head = columns[2]
    return dict(tome=tome, article=article, head=head)
def LGERow(tome):
    """Build a row reader for the LGE metadata of a given tome.

    The returned function takes one CSV row and maps column 0 to 'id',
    column 1 to 'rank' and column 2 to 'head'; 'tome' is set to the `tome`
    value captured when the reader was built.
    """
    def readRow(columns):
        record = {}
        record['id'] = columns[0]
        record['tome'] = tome
        record['rank'] = columns[1]
        record['head'] = columns[2]
        return record

    return readRow
def unserializeMetadata(path, rowReader):
    """Lazily read the CSV file at `path`, yielding one record per data row.

    The first row is treated as a header and skipped. Every following
    non-empty row (a list of strings, as produced by `csv.reader`) is passed
    to `rowReader`, and whatever it returns is yielded.

    This is a generator: the file is opened on the first iteration and stays
    open until the generator is exhausted or closed.
    """
    # newline='' lets the csv module do its own newline handling, so quoted
    # fields containing line breaks are parsed correctly.
    with open(path, newline='') as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header; an empty file just yields nothing
        for row in rows:
            # csv.reader returns [] for blank lines; there is nothing to read
            if row:
                yield rowReader(row)
#def concat(generators):
# for g in generators:
# for x in g:
# yield x
def naiveIndexBy(field, elements):
    """Index `elements` by the value each one holds under `field`.

    Returns a dict mapping each value to the element carrying it. A value
    found on more than one element is kept as a key but mapped to None,
    marking it as ambiguous.
    """
    index = {}
    for element in elements:
        value = element[field]
        # a second (or later) occurrence of the same value voids the entry
        index[value] = None if value in index else element
    return index
def naiveGetArrows(source, target):
    """Yield a link for every 'head' that is unambiguous on both sides.

    Both collections are indexed by their 'head' field with `naiveIndexBy`.
    For each head mapped to an actual element (not None) in both indexes, a
    dict {'source': <source element>, 'target': <target element>} is yielded,
    following the iteration order of the source index.
    """
    sourceByHead = naiveIndexBy('head', source)
    targetByHead = naiveIndexBy('head', target)
    for head, sourceArticle in sourceByHead.items():
        if sourceArticle is None:
            continue
        # .get() returns None both for a missing head and for an ambiguous one
        targetArticle = targetByHead.get(head)
        if targetArticle is None:
            continue
        yield {'source': sourceArticle, 'target': targetArticle}
#def growPrefixes(d, keys, maxLength):
# for length in range(maxLength, 1, -1):
# newGeneration = {}
# for key in keys:
# if len(key) == length:
# newKey = key[:-1]
# if newKey in newGeneration or newKey in d:
# newGeneration[newKey] = None
# else:
# newGeneration[newKey] = d[key]
# for key, value in newGeneration.items():
# if value is not None:
# d[key] = value
# keys.add(key)
# elif key not in d:
# d[key] = value
#def indexBy(field, elements, prefix=True):
# d = {}
# for e in elements:
# key = e[field]
# if key in d:
# d[key] = None
# else:
# d[key] = e
# if prefix:
# keys = set(d.keys())
# growPrefixes(d, keys, max(map(len, keys)))
# return d
#def headWords(head):
# words = head.split()
# if len(words) == 1:
# return words
# else:
# return [w for w in map(lambda s: s.strip(',.'), words) if w.isupper()]
#def identify(head, haystack):
# if head in haystack:
# if haystack[head] is not None:
# return {'type': 'exact', 'match': head, 'found': haystack[head]}
# else:
# return None
# else:
# prefix = head[:-1]
# while len(prefix) > 0 and prefix not in haystack:
# prefix = prefix[:-1]
# if prefix in haystack and haystack[prefix] is not None:
# return {
# 'type': 'prefix',
# 'match': head,
# 'found': haystack[prefix],
# 'precision': len(prefix) / len(head)
# }
# else:
# return None
#
#def getArrows(source, target):
# for article in source:
# heads = headWords(article['head'])
# identified = map(lambda w: identify(w, target), heads)
# entries = [e for e in identified if e is not None]
# if len(entries) == 1:
# yield {
# 'type': 'match',
# 'source': article,
# 'target': entries[0]
# }
# elif len(entries) > 1:
# yield {
# 'type': 'ambiguity',
# 'source': article,
# 'target': entries
# }
#def interesting(arrow):
# if arrow['type'] == 'match':
# target = arrow['target']
# return len(target['match']) > 3 and (target['type'] == 'exact' or target['precision'] > 0.8)
#gold = [a for a in arrows if interesting(a)]
def _writeArrows(arrows, output):
    """Write the CSV header, then one line per arrow, to the text stream `output`."""
    toCsv = csv.writer(output, lineterminator='\n')
    toCsv.writerow(
        ['head', 'id', 'tomeEDdA', 'rankEDdA', 'tomeLGE', 'rankLGE']
    )
    for arrow in arrows:
        source = arrow['source']
        target = arrow['target']
        toCsv.writerow([
            target['head'],
            target['id'],
            source['tome'],
            source['article'],
            target['tome'],
            target['rank']
        ])


def getMetadata(arrows, path=None):
    """Serialize `arrows` as CSV, to the file at `path` or to standard output.

    Each arrow is a dict with a 'source' entry (providing 'tome' and
    'article') and a 'target' entry (providing 'head', 'id', 'tome' and
    'rank'). The output starts with the header line
    head,id,tomeEDdA,rankEDdA,tomeLGE,rankLGE.

    When `path` is None the lines go to `sys.stdout`, which is left open.
    Otherwise the file at `path` is created or truncated, and is closed
    before returning, even if writing fails.
    """
    if path is None:
        _writeArrows(arrows, sys.stdout)
    else:
        # newline='' keeps the explicit '\n' line terminator untranslated
        with open(path, 'w', newline='') as output:
            _writeArrows(arrows, output)
if __name__ == '__main__':
    # Usage: parallel-links.py EDDA_METADATA_CSV LGE_ROOT_DIR [OUTPUT_CSV]
    edda = list(unserializeMetadata(sys.argv[1], EDdARow))
    # No `concat` function is defined in this file (it only exists as
    # commented-out code), so the per-tome readers for T1..T31 are
    # flattened inline here.
    lge = [
        article
        for T in range(1, 32)
        for article in unserializeMetadata(
            f'{sys.argv[2]}/T{T}/metadata.csv', LGERow(T)
        )
    ]
    # The output path is optional: without it getMetadata writes to stdout.
    outputPath = sys.argv[3] if len(sys.argv) > 3 else None
    # The arrows are materialized first, so a failure while computing them
    # happens before the output file is opened and truncated.
    getMetadata(list(naiveGetArrows(edda, lge)), outputPath)