#!/usr/bin/env python3

import csv
import sys

def EDdARow(columns):
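    """Map one row of the EDdA metadata CSV to a dict: tome, article rank within the tome, and headword."""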
    return {
            'tome': columns[0],
            'article': columns[1],
            'head': columns[2],
            }

def LGERow(tome):
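    """Build a row parser for one LGE tome, binding the tome number: id, tome, rank and headword."""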
    return lambda columns: {
            'id': columns[0],
            'tome': tome,
            'rank': columns[1],
            'head': columns[2]
        }

def unserializeMetadata(path, rowReader):
    """Yield one parsed dict per data row of the CSV at `path`, skipping the header."""
    with open(path) as f:
        inputFile = csv.reader(f)
        next(inputFile, None)  # skip the header row
        for row in inputFile:
            yield rowReader(row)

def concat(generators):
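    """Yield the elements of each generator in turn (flattens one level of nesting)."""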
    for g in generators:
        for x in g:
            yield x

def naiveIndexBy(field, elements):
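    """Index `elements` by `field`; keys seen more than once are mapped to None to mark them ambiguous."""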
    d = {}
    for e in elements:
        key = e[field]
        if key in d:
            d[key] = None
        else:
            d[key] = e
    return d

def growPrefixes(d, keys, maxLength):
    """Extend index `d` with the proper prefixes of its keys.

    Working from the longest keys down, each key of length `length` proposes
    its prefix of length `length - 1`; a prefix proposed by several keys is
    recorded as None (ambiguous), and entries already present in `d` are never
    overwritten.
    """
    for length in range(maxLength, 1, -1):
        newGeneration = {}
        for key in keys:
            if len(key) == length:
                newKey = key[:-1]
                if newKey in newGeneration or newKey in d:
                    newGeneration[newKey] = None
                else:
                    newGeneration[newKey] = d[key]
        for key, value in newGeneration.items():
            if value is not None:
                # unambiguous prefix: index it and allow it to be shortened further
                d[key] = value
                keys.add(key)
            elif key not in d:
                # record the ambiguity so an exact lookup on this prefix fails later
                d[key] = value

def indexBy(field, elements, prefix=True):
    """Index `elements` by `field` as in naiveIndexBy, optionally adding unambiguous prefixes."""
    d = naiveIndexBy(field, elements)
    if prefix:
        keys = set(d.keys())
        growPrefixes(d, keys, max(map(len, keys)))
    return d

def headWords(head):
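    """Return the candidate headwords of an entry: the head itself if it is a single word, otherwise its all-uppercase words with surrounding commas and periods stripped."""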
    words = head.split()
    if len(words) == 1:
        return words
    else:
        return [w for w in map(lambda s: s.strip(',.'), words) if w.isupper()]

def identify(head, haystack):
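    """Look `head` up in the index `haystack`: return an exact match, fall back to the longest indexed prefix (with a precision ratio), or None when nothing unambiguous is found."""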
    if head in haystack:
        if haystack[head] is not None:
            return {'type': 'exact', 'match': head, 'found': haystack[head]}
        else:
            return None
    else:
        prefix = head[:-1]
        while len(prefix) > 0 and prefix not in haystack:
            prefix = prefix[:-1]
        if prefix in haystack and haystack[prefix] is not None:
            return {
                    'type': 'prefix',
                    'match': head,
                    'found': haystack[prefix],
                    'precision': len(prefix) / len(head)
                    }
        else:
            return None

def naiveGetArrows(source, target):
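    """Yield a link for every head that matches exactly and unambiguously between the two corpora."""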
    indexedSource = naiveIndexBy('head', source)
    indexedTarget = naiveIndexBy('head', target)
    for head, article in indexedSource.items():
        if article is not None and head in indexedTarget and indexedTarget[head] is not None:
            yield {
                    'source': article,
                    'target': indexedTarget[head]
                    }

def getArrows(source, target):
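    """Link each source article to target entries identified from its headwords, yielding a 'match' when exactly one candidate remains and an 'ambiguity' when several do."""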
    for article in source:
        heads = headWords(article['head'])
        identified = map(lambda w: identify(w, target), heads)
        entries = [e for e in identified if e is not None]
        if len(entries) == 1:
            yield {
                    'type': 'match',
                    'source': article,
                    'target': entries[0]
                    }
        elif len(entries) > 1:
            yield {
                    'type': 'ambiguity',
                    'source': article,
                    'target': entries
                    }

def interesting(arrow):
    """Keep matches whose head is reasonably long and either exact or a close prefix match."""
    if arrow['type'] == 'match':
        target = arrow['target']
        return len(target['match']) > 3 and (target['type'] == 'exact' or target['precision'] > 0.8)
    return False

#gold = [a for a in arrows if interesting(a)]

def getMetadata(arrows, path=None):
    """Write the arrows found between the two corpora as a CSV of parallel articles."""
    output = sys.stdout if path is None else open(path, 'w')
    try:
        toCsv = csv.writer(output, lineterminator='\n')
        toCsv.writerow(
                ['head', 'id', 'tomeEDdA', 'rankEDdA', 'tomeLGE', 'rankLGE']
                )
        for arrow in arrows:
            toCsv.writerow([
                arrow['target']['head'],
                arrow['target']['id'],
                arrow['source']['tome'],
                arrow['source']['article'],
                arrow['target']['tome'],
                arrow['target']['rank']
                ])
    finally:
        if path is not None:
            output.close()

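# Expected command-line arguments: the EDdA metadata CSV, the root directory of
# the LGE corpus (which must contain T1/metadata.csv through T31/metadata.csv),
# and the path of the output CSV.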
if __name__ == '__main__':
    edda = list(unserializeMetadata(sys.argv[1], EDdARow))
    lge = list(concat([
        unserializeMetadata(f'{sys.argv[2]}/T{T}/metadata.csv', LGERow(T)) for T in range(1, 32)
    ]))
    getMetadata(list(naiveGetArrows(edda, lge)), sys.argv[3])