# parallel-links.py

#!/usr/bin/env python3

import csv
import sys

def EDdARow(columns):
    """Turn one raw CSV row into an article record.

    The first three cells of `columns` become, in order, the 'tome',
    'article' and 'head' entries of the returned dict; any further cells
    are ignored. Indexing a row shorter than three cells raises IndexError.
    """
    tome = columns[0]
    article = columns[1]
    head = columns[2]
    return dict(tome=tome, article=article, head=head)

def LGERow(tome):
    """Build a row reader bound to a given tome.

    The returned function maps a raw CSV row to a dict whose 'id', 'rank'
    and 'head' come from the first three cells of the row, and whose 'tome'
    is the value passed here (it is not read from the row).
    """
    def readRow(columns):
        record = {'id': columns[0], 'tome': tome}
        record['rank'] = columns[1]
        record['head'] = columns[2]
        return record

    return readRow

def unserializeMetadata(path, rowReader):
    """Lazily read the CSV file at `path` and yield one record per data row.

    The first row of the file is treated as a header and skipped. Every
    following non-empty row (a list of strings, as produced by csv.reader)
    is handed to `rowReader`, and whatever it returns is yielded.

    path      -- location of the CSV file to read
    rowReader -- callable taking the list of cells of one row and returning
                 the record to yield (e.g. EDdARow, or LGERow(tome))

    Being a generator, the file is only opened once iteration starts, and
    stays open until the generator is exhausted or closed.
    """
    # newline='' lets the csv module handle line endings itself, including
    # newlines embedded in quoted cells.
    with open(path, newline='') as f:
        rows = csv.reader(f)
        next(rows, None)  # drop the header row; harmless on an empty file
        for row in rows:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing
                # for rowReader to index into.
                continue
            yield rowReader(row)

#def concat(generators):
#    for g in generators:
#        for x in g:
#            yield x

def naiveIndexBy(field, elements):
    """Index `elements` by the value each one holds under `field`.

    Returns a dict mapping every key value seen to the single element
    carrying it. A key seen more than once is kept in the dict but mapped
    to None, which marks it as ambiguous.
    """
    index = {}
    for element in elements:
        value = element[field]
        # A second (or later) sighting of the same key blanks the entry.
        index[value] = None if value in index else element
    return index

def naiveGetArrows(source, target):
    """Yield a link for every head that is unambiguous on both sides.

    Both collections are indexed on their 'head' field with naiveIndexBy.
    For each head that maps to exactly one article in `source` and exactly
    one in `target`, a dict {'source': ..., 'target': ...} holding the two
    articles is yielded. Heads that are duplicated on either side, or
    missing from `target`, produce nothing.
    """
    sourceByHead = naiveIndexBy('head', source)
    targetByHead = naiveIndexBy('head', target)
    for head, article in sourceByHead.items():
        if article is None:
            # Ambiguous head on the source side.
            continue
        # None here means either "absent" or "ambiguous" in the target.
        match = targetByHead.get(head)
        if match is None:
            continue
        yield {'source': article, 'target': match}

#def growPrefixes(d, keys, maxLength):
#    for length in range(maxLength, 1, -1):
#        newGeneration = {}
#        for key in keys:
#            if len(key) == length:
#                newKey = key[:-1]
#                if newKey in newGeneration or newKey in d:
#                    newGeneration[newKey] = None
#                else:
#                    newGeneration[newKey] = d[key]
#        for key, value in newGeneration.items():
#            if value is not None:
#                d[key] = value
#                keys.add(key)
#            elif key not in d:
#                d[key] = value

#def indexBy(field, elements, prefix=True):
#    d = {}
#    for e in elements:
#        key = e[field]
#        if key in d:
#            d[key] = None
#        else:
#            d[key] = e
#    if prefix:
#        keys = set(d.keys())
#        growPrefixes(d, keys, max(map(len, keys)))
#    return d

#def headWords(head):
#    words = head.split()
#    if len(words) == 1:
#        return words
#    else:
#        return [w for w in map(lambda s: s.strip(',.'), words) if w.isupper()]

#def identify(head, haystack):
#    if head in haystack:
#        if haystack[head] is not None:
#            return {'type': 'exact', 'match': head, 'found': haystack[head]}
#        else:
#            return None
#    else:
#        prefix = head[:-1]
#        while len(prefix) > 0 and prefix not in haystack:
#            prefix = prefix[:-1]
#        if prefix in haystack and haystack[prefix] is not None:
#            return {
#                    'type': 'prefix',
#                    'match': head,
#                    'found': haystack[prefix],
#                    'precision': len(prefix) / len(head)
#                    }
#        else:
#            return None
#
#def getArrows(source, target):
#    for article in source:
#        heads = headWords(article['head'])
#        identified = map(lambda w: identify(w, target), heads)
#        entries = [e for e in identified if e is not None]
#        if len(entries) == 1:
#            yield {
#                    'type': 'match',
#                    'source': article,
#                    'target': entries[0]
#                    }
#        elif len(entries) > 1:
#            yield {
#                    'type': 'ambiguity',
#                    'source': article,
#                    'target': entries
#                    }

#def interesting(arrow):
#    if arrow['type'] == 'match':
#        target = arrow['target']
#        return len(target['match']) > 3 and (target['type'] == 'exact' or target['precision'] > 0.8)

#gold = [a for a in arrows if interesting(a)]

def getMetadata(arrows, path=None):
    """Write the given arrows as CSV, one line per arrow after a header.

    arrows -- iterable of dicts with a 'source' and a 'target' entry. The
              source must provide 'tome' and 'article'; the target must
              provide 'head', 'id', 'tome' and 'rank'.
    path   -- file to (over)write; when None the CSV goes to sys.stdout.

    The columns are: head, id, tomeEDdA, rankEDdA, tomeLGE, rankLGE, where
    head and id are taken from the target article. Lines are terminated
    with '\n'.
    """
    def writeTo(output):
        toCsv = csv.writer(output, lineterminator='\n')
        toCsv.writerow(
                ['head', 'id', 'tomeEDdA', 'rankEDdA', 'tomeLGE', 'rankLGE']
                )
        for arrow in arrows:
            source, target = arrow['source'], arrow['target']
            toCsv.writerow([
                target['head'],
                target['id'],
                source['tome'],
                source['article'],
                target['tome'],
                target['rank']
                ])

    if path is None:
        # stdout is not ours to close.
        writeTo(sys.stdout)
    else:
        # The context manager guarantees the file is flushed and closed even
        # if a malformed arrow raises; newline='' keeps the csv module in
        # charge of line endings so '\n' is written verbatim.
        with open(path, 'w', newline='') as output:
            writeTo(output)

if __name__ == '__main__':
    # Command line: <EDdA metadata CSV> <LGE root directory> [<output CSV>]
    # When the output path is omitted the result is written to stdout.
    if len(sys.argv) < 3:
        sys.exit(f'usage: {sys.argv[0]} EDDA_METADATA_CSV LGE_ROOT [OUTPUT_CSV]')
    edda = list(unserializeMetadata(sys.argv[1], EDdARow))
    # One metadata file per tome, T1 to T31 under the LGE root; the tome
    # number is injected into each record by LGERow since the rows are read
    # with only id, rank and head. The records of all tomes are flattened
    # into a single list.
    lge = [
        article
        for T in range(1, 32)
        for article in unserializeMetadata(
            f'{sys.argv[2]}/T{T}/metadata.csv', LGERow(T))
    ]
    outputPath = sys.argv[3] if len(sys.argv) > 3 else None
    getMetadata(naiveGetArrows(edda, lge), outputPath)