-
Alice Brenon authored f8b70bc4
parallel-links.py 4.62 KiB
#!/usr/bin/env python3
import csv
import sys
def EDdARow(columns):
    """Turn one CSV row into a record with 'tome', 'article' and 'head' keys.

    The three values are taken from the first three columns, in that order;
    any further columns are ignored.
    """
    tome = columns[0]
    article = columns[1]
    head = columns[2]
    return {'tome': tome, 'article': article, 'head': head}
def LGERow(tome):
    """Return a row reader that tags every record it builds with `tome`.

    The returned callable maps a CSV row to a record whose 'id', 'rank' and
    'head' come from the first three columns, and whose 'tome' is the value
    given here (it is not read from the row).
    """
    def readRow(columns):
        record = {'id': columns[0], 'tome': tome}
        record['rank'] = columns[1]
        record['head'] = columns[2]
        return record

    return readRow
def unserializeMetadata(path, rowReader):
    """Lazily read a CSV file, yielding rowReader(row) for each data row.

    The first row of the file is treated as a header and skipped. The file
    stays open while the generator is being consumed and is closed when it
    is exhausted (or garbage-collected).

    path -- location of the CSV file to read
    rowReader -- callable turning a list of column values into a record
    """
    # newline='' lets the csv module handle line endings itself, so that
    # line breaks embedded in quoted fields are preserved untouched.
    with open(path, newline='') as f:
        rows = csv.reader(f)
        # Drop the header row; the default keeps an empty file from raising.
        next(rows, None)
        for row in rows:
            yield rowReader(row)
def concat(generators):
    """Yield every item of every iterable in `generators`, one after another."""
    for each in generators:
        yield from each
def naiveIndexBy(field, elements):
    """Index `elements` by the value of `field`, marking duplicates as None.

    A key seen exactly once maps to its element; a key seen more than once
    maps to None, so that ambiguous keys can be told apart from absent ones.
    """
    index = {}
    for element in elements:
        key = element[field]
        # Any repeated key is ambiguous, however many times it shows up.
        index[key] = None if key in index else element
    return index
def growPrefixes(d, keys, maxLength):
    """Extend index `d` in place with the prefixes of the keys in `keys`.

    Going from `maxLength` down to 2, every key of the current length is
    shortened by one character. A shortened key that is produced only once
    in that round and is not already in `d` inherits the entry of the key it
    comes from, and is added to `keys` so that it gets shortened in turn
    during the next round. A shortened key that is produced several times,
    or that already exists in `d`, is ambiguous: it is recorded as None
    unless `d` already holds an entry for it, which is then left untouched.

    Both `d` and `keys` are modified; nothing is returned.
    """
    for length in range(maxLength, 1, -1):
        candidates = {}
        for key in (k for k in keys if len(k) == length):
            shorter = key[:-1]
            clash = shorter in candidates or shorter in d
            candidates[shorter] = None if clash else d[key]
        # `keys` is only updated once the round is over, so the set is never
        # modified while it is being iterated.
        for shorter, entry in candidates.items():
            if entry is not None:
                d[shorter] = entry
                keys.add(shorter)
            else:
                d.setdefault(shorter, None)
def indexBy(field, elements, prefix=True):
    """Index `elements` by the value of `field`, optionally with prefixes.

    A key seen exactly once maps to its element, a key seen several times
    maps to None. When `prefix` is true the index is then extended by
    growPrefixes, using the length of the longest key as starting point.

    field -- key to read from each element
    elements -- iterable of mappings that all contain `field`
    prefix -- whether to also index the prefixes of the keys

    Returns the resulting dict, which is empty when `elements` is empty.
    """
    d = {}
    for e in elements:
        key = e[field]
        d[key] = None if key in d else e
    # With no key at all there is nothing to extend, and max() would raise
    # ValueError on the empty sequence of lengths.
    if prefix and d:
        keys = set(d)
        growPrefixes(d, keys, max(map(len, keys)))
    return d
def headWords(head):
    """Extract the words to look up from a headword string.

    A head made of a single whitespace-separated word is returned as it is,
    in a one-element list. Otherwise each word is stripped of surrounding
    commas and periods, and only the words in upper case are kept.
    """
    tokens = head.split()
    if len(tokens) != 1:
        stripped = [token.strip(',.') for token in tokens]
        return [token for token in stripped if token.isupper()]
    return tokens
def identify(head, haystack):
    """Look `head` up in the index `haystack`, falling back on its prefixes.

    When `head` is a key of `haystack`, the result is an 'exact' match on
    its entry, or None if that entry is None. Otherwise `head` is shortened
    one character at a time until a key of `haystack` is found; if that key
    has an entry other than None, the result is a 'prefix' match whose
    'precision' is the length of the prefix divided by the length of `head`.
    In every other case the result is None.
    """
    if head in haystack:
        entry = haystack[head]
        if entry is None:
            return None
        return {'type': 'exact', 'match': head, 'found': entry}

    # Find the longest proper prefix of `head` that is a key of the index.
    cut = len(head) - 1
    while cut > 0 and head[:cut] not in haystack:
        cut -= 1
    prefix = head[:cut]

    if prefix not in haystack or haystack[prefix] is None:
        return None
    return {
        'type': 'prefix',
        'match': head,
        'found': haystack[prefix],
        'precision': len(prefix) / len(head)
    }
def naiveGetArrows(source, target):
    """Pair the articles of `source` and `target` that share the same head.

    Both collections are indexed on their 'head' field with naiveIndexBy, so
    a head occurring several times on either side is never paired. Yields a
    dict with a 'source' and a 'target' entry for each head found exactly
    once on both sides.
    """
    sourceByHead = naiveIndexBy('head', source)
    targetByHead = naiveIndexBy('head', target)
    for head, article in sourceByHead.items():
        if article is None:
            continue
        # A missing head and an ambiguous one (None entry) are both skipped.
        counterpart = targetByHead.get(head)
        if counterpart is not None:
            yield {'source': article, 'target': counterpart}
def getArrows(source, target):
    """Link each article of `source` to the entries of the index `target`.

    The words returned by headWords for the article's 'head' are each looked
    up in `target` with identify. An article with a single successful lookup
    yields a 'match' arrow whose 'target' is that result; an article with
    several yields an 'ambiguity' arrow whose 'target' is the list of all
    results. Articles without any successful lookup yield nothing.
    """
    for article in source:
        lookups = (identify(word, target) for word in headWords(article['head']))
        entries = [found for found in lookups if found is not None]
        count = len(entries)
        if count == 0:
            continue
        if count == 1:
            yield {'type': 'match', 'source': article, 'target': entries[0]}
        else:
            yield {'type': 'ambiguity', 'source': article, 'target': entries}
def interesting(arrow):
    """Tell whether a 'match' arrow is reliable enough to be kept.

    The matched text must be longer than 3 characters, and the match must
    either be 'exact' or have a 'precision' above 0.8. Arrows of any other
    type give None.
    """
    if arrow['type'] != 'match':
        return None
    target = arrow['target']
    if len(target['match']) <= 3:
        return False
    return target['type'] == 'exact' or target['precision'] > 0.8
#gold = [a for a in arrows if interesting(a)]
def getMetadata(arrows, path=None):
    """Write the given arrows as CSV, one line per arrow after a header.

    Despite its name this function returns nothing: it serializes, for each
    arrow, the 'head', 'id', 'tome' and 'rank' of its target along with the
    'tome' and 'article' of its source.

    arrows -- iterable of dicts with a 'source' and a 'target' record
    path -- file to (over)write; the standard output is used when None
    """
    # newline='' keeps the platform from rewriting the line terminator that
    # the csv writer is explicitly told to use.
    output = sys.stdout if path is None else open(path, 'w', newline='')
    try:
        toCsv = csv.writer(output, lineterminator='\n')
        toCsv.writerow(
            ['head', 'id', 'tomeEDdA', 'rankEDdA', 'tomeLGE', 'rankLGE']
        )
        for arrow in arrows:
            source, target = arrow['source'], arrow['target']
            toCsv.writerow([
                target['head'],
                target['id'],
                source['tome'],
                source['article'],
                target['tome'],
                target['rank']
            ])
    finally:
        # Only close what was opened here: sys.stdout belongs to the caller.
        if path is not None:
            output.close()
if __name__ == '__main__':
    # Arguments: the first metadata CSV file, the directory holding the
    # T1 … T31 sub-directories (each with its own metadata.csv), and
    # optionally the file to write the result to (standard output otherwise).
    if len(sys.argv) < 3:
        sys.exit(f'Usage: {sys.argv[0]} EDDA_METADATA LGE_DIRECTORY [OUTPUT]')
    eddaPath, lgeRoot = sys.argv[1], sys.argv[2]
    outputPath = sys.argv[3] if len(sys.argv) > 3 else None
    edda = list(unserializeMetadata(eddaPath, EDdARow))
    lge = list(concat([
        unserializeMetadata(f'{lgeRoot}/T{T}/metadata.csv', LGERow(T))
        for T in range(1, 32)
    ]))
    # The arrows are fully computed before the output gets opened, so that a
    # failure while reading the inputs does not leave a truncated file.
    getMetadata(list(naiveGetArrows(edda, lge)), outputPath)