From bfa8aeb4acb1c3f030e1465490f22b996e95a0b3 Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Wed, 8 Dec 2021 22:04:14 +0100 Subject: [PATCH] Generalize the levenshtein implementation to be able to use it on both the tokens and the named-entities lists --- README.md | 19 ++++---- comparator.py | 42 ----------------- ene_comparator/__init__.py | 2 - ene_comparator/editDistance.py | 34 -------------- ene_comparator/sequence.py | 65 --------------------------- guix.scm | 31 +++++++++++++ ne-score | 24 ++++++++++ ne_score/__init__.py | 0 ne_score/cartesianCost.py | 39 ++++++++++++++++ ne_score/generalLevenshtein.py | 81 +++++++++++++++++++++++++++++++++ ne_score/namedEntity.py | 82 ++++++++++++++++++++++++++++++++++ ne_score/sequence.py | 49 ++++++++++++++++++++ ne_score/tokens.py | 37 +++++++++++++++ setup.py | 8 ++-- 14 files changed, 358 insertions(+), 155 deletions(-) delete mode 100755 comparator.py delete mode 100644 ene_comparator/__init__.py delete mode 100644 ene_comparator/editDistance.py delete mode 100644 ene_comparator/sequence.py create mode 100644 guix.scm create mode 100755 ne-score create mode 100644 ne_score/__init__.py create mode 100644 ne_score/cartesianCost.py create mode 100644 ne_score/generalLevenshtein.py create mode 100644 ne_score/namedEntity.py create mode 100644 ne_score/sequence.py create mode 100644 ne_score/tokens.py diff --git a/README.md b/README.md index 91b52d8..e79e7b1 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,13 @@ -# ENE comparator +# NE score -Run the `./comparator.py` script. The weights of the edit distance can be -customized with the following parameters (see `--help` to learn more): +Rate a named-entity annotator against a truth reference. 
-- `--missingENE` -- `--missingType` -- `--wrongType` -- `--namesWeight` -- `--otherWeight` +The `ne-score` script expects two arguments in this order: +- the path to the XML-TEI file containing the predictions +- the path to the XML-TEI file containing the truth + +For now it simply displays the named-entity found in the prediction and truth, +then the diff computed on both. + +TODO: convert this information in terms of precision/recall, and break it down +according to each named-entity type (Person, Date or Place). diff --git a/comparator.py b/comparator.py deleted file mode 100755 index d1f44ad..0000000 --- a/comparator.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/python3 - -import argparse -from ene_comparator import eNESequence, Metric -from sys import argv - -def getArgs(): - parser = argparse.ArgumentParser() - parser.add_argument("leftFile", help="path to the first file to compare") - parser.add_argument("rightFile", help="path to the second file to compare") - parser.add_argument("--missingENE", help="weight for a missing ENE", - type=float) - parser.add_argument("--missingType", help="weight for an ENE with a missing type", - type=float) - parser.add_argument("--wrongType", help="weight for an ENE with the wrong type", - type=float) - parser.add_argument("--namesWeight", help="weight for missing \"import\" words (hopefully proper nouns, but technically just not articles or prepositions)", - type=float) - parser.add_argument("--otherWeight", help="weight for other missing words (hopefull only articles or prepositions)", - type=float) - return parser.parse_args() - -def getMetric(args): - missingENE = args.missingENE or 10. - missingType = args.missingType or 5. - wrongType = args.wrongType or 5. - namesWeight = args.namesWeight or 0. - otherWeight = args.otherWeight or 0. 
- return Metric(missingENE, missingType, wrongType, namesWeight, otherWeight) - -def main(): - args = getArgs() - print(args) - m = getMetric(args) - leftSequence = eNESequence(args.leftFile) - rightSequence = eNESequence(args.rightFile) - print([str(s) for s in leftSequence]) - print([str(s) for s in rightSequence]) - print(m.differences(leftSequence, rightSequence)) - -if __name__ == '__main__': - main() diff --git a/ene_comparator/__init__.py b/ene_comparator/__init__.py deleted file mode 100644 index 41ea9c2..0000000 --- a/ene_comparator/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .editDistance import Metric -from .sequence import eNESequence diff --git a/ene_comparator/editDistance.py b/ene_comparator/editDistance.py deleted file mode 100644 index 8ec907f..0000000 --- a/ene_comparator/editDistance.py +++ /dev/null @@ -1,34 +0,0 @@ -def scalar(d1, d2): - s = 0 - for k in set(d1.keys()).intersection(d2.keys()): - s += d1[k] * d2[k] - return s - -class Metric: - def __init__(self, missingENE, missingType, wrongType, namesWeight, - otherWeight): - self.weight = { - 'missingENE': missingENE, - 'missingType': missingType, - 'wrongType': wrongType, - 'namesWeight': namesWeight, - 'otherWeight': otherWeight - } - - def differences(self, l1, l2): - return self._differences(l1, 0, l2, 0) - - def _differences(self, l1, i1, l2, i2): - missingENE = self.weight['missingENE'] - if i1 >= len(l1): - return (len(l2)-i2) * missingENE - elif i2 >= len(l2): - return (len(l1)-i1) * missingENE - else: - delta = scalar(l1[i1].diff(l2[i2]), self.weight) - return min( - missingENE + self._differences(l1, i1+1, l2, i2), - missingENE + self._differences(l1, i1, l2, i2+1), - delta + self._differences(l1, i1+1, l2, i2+1) - ) - diff --git a/ene_comparator/sequence.py b/ene_comparator/sequence.py deleted file mode 100644 index 3500a7f..0000000 --- a/ene_comparator/sequence.py +++ /dev/null @@ -1,65 +0,0 @@ -from lxml import etree -import re - -tokenizer = re.compile("[- ']") - -def 
getTokens(words): - return [s for word in words for s in tokenizer.split(word.strip()) if len(s) > 0] - -class ENE: - def __init__(self, eNEType, wElems): - self.eNEType = eNEType - self.tokens = getTokens([w.text for w in wElems]) - - def diff(self, other): - missingType = self.eNEType is None or other.eNEType is None - return { - 'missingType': missingType, - 'wrongType': not missingType and self.eNEType != other.eNEType - # TODO: write a levenshtein for tokens with namesWeight and - # otherWeight - } - - def __str__(self): - return "|{tokens}|({eNEType})".format( - tokens=','.join(self.tokens), - eNEType=self.eNEType - ) - -def eNEFromPerdidoElement(e): - ws = e.xpath('.//w') - wsType = None - for w in ws: - if 'type' in w.attrib: - wsType = w.attrib['type'] - if (e.attrib['type'], wsType) == ('place', 'NPr'): - return ENE('NP_Spatial', ws) - else: - return ENE(None, ws) - -def eNEFromTEIWAElement(e): - if e.tag == 'w': - words = [e] - else: - words = e.xpath('./w') - return ENE(e.attrib['enc_tags'], words) - -def fromPerdido(dom): - return map(eNEFromPerdidoElement, dom.xpath('//rs[@subtype="no"]')) - -def fromTEIWA(dom): - elements = dom.xpath('//*[contains(@enc_tags, "NP_")]') - return map(eNEFromTEIWAElement, elements) - -def eNESequence(filePath): - try: - with open(filePath) as f: - try: - dom = etree.parse(f) - except etree.XMLSyntaxError: - return None - rootTag = dom.getroot().tag - sequencer = fromPerdido if rootTag == 'tei.2' else fromTEIWA - return list(sequencer(dom)) - except FileNotFoundError: - return None diff --git a/guix.scm b/guix.scm new file mode 100644 index 0000000..577180c --- /dev/null +++ b/guix.scm @@ -0,0 +1,31 @@ +(use-modules (gnu packages xml) + (guix build-system python) + (guix download) + (guix licenses) + (guix packages)) + +(package + (name "ne-score") + (version "0.1.0") + (source + (origin + (method url-fetch) + (uri ".") + (file-name (string-append name "-" version)) + (sha256 + (base32 
"0000000000000000000000000000000000000000000000000000")))) + (build-system python-build-system) + (propagated-inputs + `(("python-lxml" ,python-lxml))) + (home-page "https://gitlab.liris.cnrs.fr/abrenon/ne-score") + (synopsis "A python tool to rate a named-entity annotator.") + (description + "This tool detects the named-entities in XML-TEI files and computes a diff + on them to be able to measure the precision and recall of a named-entities + annotator. + + The diff is computed by a generalization of the Levenshtein distance on + lists expecting a cost function to describe the edition process and which + can return an arbitrary (comparable) type. The implementation returns both + the cost and the edition path.") + (license bsd-3)) diff --git a/ne-score b/ne-score new file mode 100755 index 0000000..fbd47e8 --- /dev/null +++ b/ne-score @@ -0,0 +1,24 @@ +#!/usr/bin/python3 + +import argparse +from ne_score.sequence import nESequence +import ne_score.namedEntity as NE +from sys import argv + +def getArgs(): + parser = argparse.ArgumentParser() + parser.add_argument("prediction", help="path to the first file to compare") + parser.add_argument("truth", help="path to the second file to compare") + return parser.parse_args() + +def main(): + args = getArgs() + predicted = nESequence(args.prediction) + truth = nESequence(args.truth) + print([str(s) for s in predicted]) + print([str(s) for s in truth]) + diff = NE.levenshtein.diff(predicted, truth) + print(str(diff)) + +if __name__ == '__main__': + main() diff --git a/ne_score/__init__.py b/ne_score/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ne_score/cartesianCost.py b/ne_score/cartesianCost.py new file mode 100644 index 0000000..3849fb3 --- /dev/null +++ b/ne_score/cartesianCost.py @@ -0,0 +1,39 @@ +class CartesianCost: + def __init__(self, **kwargs): + if 'infinite' in kwargs: + self.infinite = True + else: + self.infinite = False + self.factors = {} + for k in self.__class__.ranking(): + 
self.factors[k] = kwargs[k] if k in kwargs else 0 + def ranking(): + raise NotImplementedError("Must override ranking") + def getCosts(self): + if self.infinite: + return (True,) + else: + factors = [self.factors[k] for k in self.__class__.ranking()] + return (False, tuple(factors)) + def __add__(self, other): + if self.infinite or other.infinite: + return self.__class__(infinite=True) + else: + d = dict([ + (k, self.factors[k] + other.factors[k]) + for k in self.__class__.ranking() + ]) + return self.__class__(**d) + def __lt__(self, other): + return self.getCosts() < other.getCosts() + def __le__(self, other): + return self.getCosts() <= other.getCosts() + def __gt__(self, other): + return self.getCosts() > other.getCosts() + def __ge__(self, other): + return self.getCosts() >= other.getCosts() + def __str__(self): + if self.infinite: + return "infinite" + else: + return str(dict([(k, str(self.factors[k])) for k in self.factors])) diff --git a/ne_score/generalLevenshtein.py b/ne_score/generalLevenshtein.py new file mode 100644 index 0000000..bf4130b --- /dev/null +++ b/ne_score/generalLevenshtein.py @@ -0,0 +1,81 @@ +from functools import reduce + +class Update: + def __init__(self, initial, final): + self.initial = initial + self.final = final + def __str__(self): + return str(self.initial) + " -> " + str(self.final) + +class Edition: + def __init__(self, sign, element): + self.sign = sign + self.element = element + def __str__(self): + return self.sign + str(self.element) + +def Insert(x): + return Edition('+', x) + +def Delete(x): + return Edition('-', x) + +def plus(a, b): + if a is None: + return b + elif b is None: + return a + else: + return a + b + +class Diff: + def __init__(self, cost, changes): + self.cost = cost + self.changes = changes + def __str__(self): + return "{diff}({cost})".format( + diff=list(map(str, self.changes)), + cost=str(self.cost) + ) + def __add__(self, other): + return Diff(plus(self.cost, other.cost), self.changes + other.changes) 
+ def __lt__(self, other): + return other.cost is not None and (self.cost is None or self.cost < other.cost) + def __le__(self, other): + return self.cost is None or (other.cost is not None and self.cost <= other.cost) + def __gt__(self, other): + return self.cost is not None and (other.cost is None or self.cost > other.cost) + def __ge__(self, other): + return other.cost is None or (self.cost is not None and self.cost >= other.cost) + +Diff.empty = Diff(None, []) + +class GLevenshtein: + def __init__(self, cost): + self.cost = cost + def rate(self, operation): + cost = self.cost(operation) + return Diff.empty if cost is None else Diff(cost, [operation]) + def diff(self, l1, l2): + return self._diff(l1, 0, l2, 0) + def _diff(self, l1, i1, l2, i2): + if i1 >= len(l1): + if len(l2) == i2: + return self.rate(None) + else: + return reduce(plus, map(self.rate, map(Insert, l2[i2:]))) + elif i2 >= len(l2): + if len(l1) == i1: + return self.rate(None) + else: + return reduce(plus, map(self.rate, map(Delete, l1[i1:]))) + else: + if l1[i1] == l2[i2]: + delta = Diff.empty + else: + delta = self.rate(Update(l1[i1], l2[i2])) + return min( + (self.rate(Delete(l1[i1])) + self._diff(l1, i1+1, l2, i2)), + (self.rate(Insert(l2[i2])) + self._diff(l1, i1, l2, i2+1)), + (delta + self._diff(l1, i1+1, l2, i2+1)) + ) diff --git a/ne_score/namedEntity.py b/ne_score/namedEntity.py new file mode 100644 index 0000000..daa438f --- /dev/null +++ b/ne_score/namedEntity.py @@ -0,0 +1,82 @@ +from ne_score.cartesianCost import CartesianCost +from ne_score.generalLevenshtein import Diff, Edition, Update, GLevenshtein +import ne_score.tokens as tokens + +class Cost(CartesianCost): + def ranking(): + return ['missingOrExtra', 'wrongType', 'missingType', 'tokensDiff'] + +class NE: + def __init__(self, NEType, wElems): + self.NEType = NEType + self.tokens = tokens.get([w.text for w in wElems]) + def __str__(self): + return "|{tokens}|({NEType})".format( + tokens=','.join(self.tokens), + 
NEType=self.NEType + ) + def __eq__(self, other): + return self.NEType == other.NEType and self.tokens == other.tokens + +def cost(operation): + if type(operation) == Update: + initial = operation.initial + final = operation.final + if len(set(initial.tokens).intersection(final.tokens)) > 0: + missingType = final.NEType is not None and initial.NEType is None + return Cost( + wrongType=not missingType and initial.NEType != final.NEType, + missingType=missingType, + tokensDiff=tokens.levenshtein.diff( + initial.tokens, + final.tokens + ) + ) + else: + return Cost(infinite=True) + elif type(operation) == Edition: + return Cost(missingOrExtra=1, tokensDiff=Diff.empty) + else: + return None + +levenshtein = GLevenshtein(cost) + +""" + def diff(self, other): + missingType = self.eNEType is None or other.eNEType is None + return { + 'missingType': missingType, + 'wrongType': not missingType and self.eNEType != other.eNEType + # TODO: write a levenshtein for tokens with namesWeight and + # otherWeight + } +class Cost: + def __init__(self, impossible, wrongType, missingType, tokensCost): + self.impossible = impossible + self.wrongType = wrongType + self.missingType = missingType + self.tokenCosts = tokenCosts + def __add__(self, other): + return Cost( + self.impossible | other.impossible, + self.name + other.name, + self.minorWord + self.minorWord + ) + def __lt__(self, other): + selfTriple = (self.impossible, self.name, self.minorWord) + otherTriple = (other.impossible, other.name, other.minorWord) + return selfTriple < otherTriple + def __le__(self, other): + selfTriple = (self.impossible, self.name, self.minorWord) + otherTriple = (other.impossible, other.name, other.minorWord) + return selfTriple <= otherTriple + def __gt__(self, other): + selfTriple = (self.impossible, self.name, self.minorWord) + otherTriple = (other.impossible, other.name, other.minorWord) + return selfTriple > otherTriple + def __ge__(self, other): + selfTriple = (self.impossible, self.name, 
self.minorWord) + otherTriple = (other.impossible, other.name, other.minorWord) + return selfTriple >= otherTriple +""" + diff --git a/ne_score/sequence.py b/ne_score/sequence.py new file mode 100644 index 0000000..315d6ca --- /dev/null +++ b/ne_score/sequence.py @@ -0,0 +1,49 @@ +from lxml import etree +from ne_score.namedEntity import NE +import ne_score.namedEntity as nE +import re + +def nEFromPerdidoElement(e): + ws = e.xpath('.//w') + wsType = None + for w in ws: + if 'type' in w.attrib: + wsType = w.attrib['type'] + #if (e.attrib['type'], wsType) == ('place', 'NPr'): + if e.attrib['type'] == 'place': + return NE('Spatial', ws) + elif e.attrib['type'] == 'date': + return NE('Date', ws) + elif e.attrib['type'] == 'person': + return NE('Person', ws) + else: + return NE(None, ws) + +posPrefix = re.compile("N._") + +def nEFromTEIWAElement(e): + if e.tag == 'w': + words = [e] + else: + words = e.xpath('./w') + return NE(posPrefix.sub("", e.attrib['enc_tags']), words) + +def fromPerdido(dom): + return map(nEFromPerdidoElement, dom.xpath('//rs[@subtype="no"]')) + +def fromTEIWA(dom): + elements = dom.xpath('//*[contains(@enc_tags, "NP_")]') + return map(nEFromTEIWAElement, elements) + +def nESequence(filePath): + try: + with open(filePath) as f: + try: + dom = etree.parse(f) + except etree.XMLSyntaxError: + return None + rootTag = dom.getroot().tag + sequencer = fromPerdido if rootTag == 'tei.2' else fromTEIWA + return list(sequencer(dom)) + except FileNotFoundError: + return None diff --git a/ne_score/tokens.py b/ne_score/tokens.py new file mode 100644 index 0000000..e189495 --- /dev/null +++ b/ne_score/tokens.py @@ -0,0 +1,37 @@ +from ne_score.generalLevenshtein import Edition, Update, GLevenshtein +from ne_score.cartesianCost import CartesianCost +import re + +tokenizer = re.compile("[- ']") + +def get(words): + return [s for word in words for s in tokenizer.split(word.strip()) if len(s) > 0] + +determiners = { + "un", "une", "des", # comma required: adjacent string literals are concatenated + "le", "la", "l'", "les", + }
+ +prepositions = { + "à", "de", "dans", "pour", "vers", "près", "en", "lès", "sur", "sous", + "au", "aux", "du", + } + +smallWords = determiners.union(prepositions) + +class Cost(CartesianCost): + def ranking(): + return ['names', 'smallWords'] + +def cost(operation): + if type(operation) == Update: + return Cost(infinite=True) + elif type(operation) == Edition: + if operation.element in smallWords: + return Cost(smallWords=1) # key must be listed in Cost.ranking(), other keys are ignored + else: + return Cost(names=1) + else: + return None + +levenshtein = GLevenshtein(cost) diff --git a/setup.py b/setup.py index 5633a7a..f9bb05d 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,14 @@ from distutils.core import setup setup( - name="ENE-comparator", + name="NE-score", version="0.1.0", description="yeah, maybe", author="Alice Brenon", author_email="alice.brenon@liris.cnrs.fr", - url="https://gitlab.liris.cnrs.fr/abrenon/ene-comparator", - packages=["ene_comparator"], + url="https://gitlab.liris.cnrs.fr/abrenon/ne-score", + packages=["ne_score"], install_requires=['lxml'], - scripts=['comparator.py'], + scripts=['ne-score'], ) -- GitLab