Generalize the levenshtein implementation to be able to use it on both the...

Generalize the levenshtein implementation to be able to use it on both the tokens and the named-entities lists

Generalize the levenshtein implementation to be able to use it on both the...
Generalize the levenshtein implementation to be able to use it on both the tokens and the named-entities lists
bfa8aeb4 · Alice Brenon · 15503ca9 · bfa8aeb4 · 15503ca9 · 15503ca9
Commit bfa8aeb4 authored 3 years ago by Alice Brenon
--- a/README.md
+++ b/README.md
-# ENE comparator
+# NE score

-Run the `./comparator.py` script. The weights of the edit distance can be
-customized with the following parameters (see `--help` to learn more):
+Rate a named-entity annotator against a truth reference.

- `--missingENE`
- `--missingType`
- `--wrongType`
- `--namesWeight`
- `--otherWeight`
+The `ne-score` script expects two arguments in this order:
+- the path to the XML-TEI file containing the predictions
+- the path to the XML-TEI file containing the truth
+
+For now it simply displays the named entities found in the prediction and in
+the truth, then the diff computed between the two.
+
+TODO: convert this information into precision/recall figures, and break it down
+by named-entity type (Person, Date or Place).
--- a/comparator.py
+++ b/comparator.py
-#!/usr/bin/python3
-
-import argparse
-from ene_comparator import eNESequence, Metric
-from sys import argv
-
-def getArgs():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("leftFile", help="path to the first file to compare")
-    parser.add_argument("rightFile", help="path to the second file to compare")
-    parser.add_argument("--missingENE", help="weight for a missing ENE",
-                        type=float)
-    parser.add_argument("--missingType", help="weight for an ENE with a missing type",
-                        type=float)
-    parser.add_argument("--wrongType", help="weight for an ENE with the wrong type",
-                        type=float)
-    parser.add_argument("--namesWeight", help="weight for missing \"import\" words (hopefully proper nouns, but technically just not articles or prepositions)",
-                        type=float)
-    parser.add_argument("--otherWeight", help="weight for other missing words (hopefull only articles or prepositions)",
-                        type=float)
-    return parser.parse_args()
-
-def getMetric(args):
-    missingENE = args.missingENE or 10.
-    missingType = args.missingType or 5.
-    wrongType = args.wrongType or 5.
-    namesWeight = args.namesWeight or 0.
-    otherWeight = args.otherWeight or 0.
-    return Metric(missingENE, missingType, wrongType, namesWeight, otherWeight)
-
-def main():
-    args = getArgs()
-    print(args)
-    m = getMetric(args)
-    leftSequence = eNESequence(args.leftFile)
-    rightSequence = eNESequence(args.rightFile)
-    print([str(s) for s in leftSequence])
-    print([str(s) for s in rightSequence])
-    print(m.differences(leftSequence, rightSequence))
-
-if __name__ == '__main__':
-    main()
--- a/ene_comparator/__init__.py
+++ b/ene_comparator/__init__.py
-from .editDistance import Metric
-from .sequence import eNESequence
--- a/ene_comparator/editDistance.py
+++ b/ene_comparator/editDistance.py
-def scalar(d1, d2):
-    s = 0
-    for k in set(d1.keys()).intersection(d2.keys()):
-        s += d1[k] * d2[k]
-    return s
-
-class Metric:
-    def __init__(self, missingENE, missingType, wrongType, namesWeight,
-            otherWeight):
-        self.weight = {
-                'missingENE': missingENE,
-                'missingType': missingType,
-                'wrongType': wrongType,
-                'namesWeight': namesWeight,
-                'otherWeight': otherWeight
-                }
-
-    def differences(self, l1, l2):
-        return self._differences(l1, 0, l2, 0)
-
-    def _differences(self, l1, i1, l2, i2):
-        missingENE = self.weight['missingENE']
-        if i1 >= len(l1):
-            return (len(l2)-i2) * missingENE
-        elif i2 >= len(l2):
-            return (len(l1)-i1) * missingENE
-        else:
-            delta = scalar(l1[i1].diff(l2[i2]), self.weight)
-            return min(
-                    missingENE + self._differences(l1, i1+1, l2, i2),
-                    missingENE + self._differences(l1, i1, l2, i2+1),
-                    delta + self._differences(l1, i1+1, l2, i2+1)
-                    )
-            
--- a/guix.scm
+++ b/guix.scm
+(use-modules (gnu packages xml)
+			 (guix build-system python)
+			 (guix download)
+			 (guix licenses)
+			 (guix packages))
+
+;; Guix package definition for the ne-score tool.
+(package
+  (name "ne-score")
+  (version "0.1.0")
+  (source
+	(origin
+	  (method url-fetch)
+	  ;; NOTE(review): the "." URI and the all-zero hash below look like
+	  ;; placeholders for a build from the local checkout -- confirm.
+	  (uri ".")
+	  (file-name (string-append name "-" version))
+	  (sha256
+		(base32 "0000000000000000000000000000000000000000000000000000"))))
+  (build-system python-build-system)
+  (propagated-inputs
+	`(("python-lxml" ,python-lxml)))
+  (home-page "https://gitlab.liris.cnrs.fr/abrenon/ne-score")
+  ;; Guix synopses neither start with an article nor end with a period.
+  (synopsis "Python tool to rate a named-entity annotator")
+  (description
+	"This tool detects the named-entities in XML-TEI files and computes a diff
+	on them to be able to measure the precision and recall of a named-entities
+	annotator.
+
+	The diff is computed by a generalization of the Levenshtein distance on
+	lists expecting a cost function to describe the edition process and which
+	can return an arbitrary (comparable) type. The implementation returns both
+	the cost and the edition path.")
+  (license bsd-3))
--- a/ne-score
+++ b/ne-score
+#!/usr/bin/python3
+
+import argparse
+from ne_score.sequence import nESequence
+import ne_score.namedEntity as NE
+from sys import argv
+
+def getArgs():
+    """Parse the command line.
+
+    Returns the argparse namespace; its `prediction` and `truth` attributes
+    hold the two positional file paths, in that order.
+    """
+    parser = argparse.ArgumentParser(
+        description="Rate a named-entity annotator against a truth reference."
+    )
+    # The order matters: the diff is computed from the prediction to the truth.
+    parser.add_argument(
+        "prediction",
+        help="path to the XML-TEI file containing the predictions",
+    )
+    parser.add_argument(
+        "truth",
+        help="path to the XML-TEI file containing the truth",
+    )
+    return parser.parse_args()
+
+def main():
+    """Print both named-entity sequences, then the diff between them."""
+    options = getArgs()
+    predicted = nESequence(options.prediction)
+    truth = nESequence(options.truth)
+    # Prediction first, truth second, exactly as they are given on the
+    # command line.
+    for sequence in (predicted, truth):
+        print(list(map(str, sequence)))
+    print(str(NE.levenshtein.diff(predicted, truth)))
+
+if __name__ == '__main__':
+    main()
--- a/ne_score/__init__.py
+++ b/ne_score/__init__.py
--- a/ne_score/cartesianCost.py
+++ b/ne_score/cartesianCost.py
+class CartesianCost:
+    """Cost made of several named factors, compared lexicographically.
+
+    Subclasses override `ranking()` to return the list of factor names, most
+    significant first. A cost is either infinite (greater than every finite
+    cost) or finite, in which case it holds one value per factor. Factors
+    missing from the constructor's keyword arguments default to 0; keywords
+    that are not listed by `ranking()` are ignored.
+    """
+    def __init__(self, **kwargs):
+        # Test the value of the flag, not its mere presence, so that
+        # `infinite=False` builds a finite cost.
+        self.infinite = bool(kwargs.get('infinite', False))
+        self.factors = {}
+        if not self.infinite:
+            for k in self.__class__.ranking():
+                self.factors[k] = kwargs.get(k, 0)
+    @staticmethod
+    def ranking():
+        """Return the factor names, most significant first."""
+        raise NotImplementedError("Must override ranking")
+    def getCosts(self):
+        """Return a tuple that orders costs: finite ones sort before infinite."""
+        if self.infinite:
+            return (True,)
+        else:
+            factors = [self.factors[k] for k in self.__class__.ranking()]
+            return (False, tuple(factors))
+    def __add__(self, other):
+        # Infinity is absorbing; finite costs add up factor by factor.
+        if self.infinite or other.infinite:
+            return self.__class__(infinite=True)
+        else:
+            d = {
+                k: self.factors[k] + other.factors[k]
+                for k in self.__class__.ranking()
+                }
+            return self.__class__(**d)
+    def __lt__(self, other):
+        return self.getCosts() < other.getCosts()
+    def __le__(self, other):
+        return self.getCosts() <= other.getCosts()
+    def __gt__(self, other):
+        return self.getCosts() > other.getCosts()
+    def __ge__(self, other):
+        return self.getCosts() >= other.getCosts()
+    def __str__(self):
+        if self.infinite:
+            return "infinite"
+        else:
+            return str({k: str(v) for k, v in self.factors.items()})
--- a/ne_score/generalLevenshtein.py
+++ b/ne_score/generalLevenshtein.py
+from functools import reduce
+
+class Update:
+    """Edit operation replacing the element `initial` by `final`."""
+    def __init__(self, initial, final):
+        self.initial, self.final = initial, final
+    def __str__(self):
+        return "{} -> {}".format(str(self.initial), str(self.final))
+
+class Edition:
+    """Edit operation on a single element, tagged by a sign string."""
+    def __init__(self, sign, element):
+        self.sign, self.element = sign, element
+    def __str__(self):
+        rendered = str(self.element)
+        return self.sign + rendered
+
+def Insert(x):
+    """Return the Edition that adds `x` (sign '+')."""
+    return Edition(sign='+', element=x)
+
+def Delete(x):
+    """Return the Edition that removes `x` (sign '-')."""
+    return Edition(sign='-', element=x)
+
+def plus(a, b):
+    """Add `a` and `b`, treating None as the neutral element."""
+    if a is None:
+        return b
+    if b is None:
+        return a
+    return a + b
+
+class Diff:
+    """A cost paired with the list of changes it accounts for.
+
+    A cost of None stands for "no cost at all" and sorts before any other
+    cost; two Diffs whose costs are both None are neither `<` nor `>`.
+    """
+    def __init__(self, cost, changes):
+        self.cost = cost
+        self.changes = changes
+    def __str__(self):
+        rendered = [str(change) for change in self.changes]
+        return "{diff}({cost})".format(diff=rendered, cost=str(self.cost))
+    def __add__(self, other):
+        total = plus(self.cost, other.cost)
+        return Diff(total, self.changes + other.changes)
+    def __lt__(self, other):
+        if other.cost is None:
+            return False
+        if self.cost is None:
+            return True
+        return self.cost < other.cost
+    def __le__(self, other):
+        if self.cost is None:
+            return True
+        if other.cost is None:
+            return False
+        return self.cost <= other.cost
+    def __gt__(self, other):
+        if self.cost is None:
+            return False
+        if other.cost is None:
+            return True
+        return self.cost > other.cost
+    def __ge__(self, other):
+        if other.cost is None:
+            return True
+        if self.cost is None:
+            return False
+        return self.cost >= other.cost
+
+# Shared neutral element: no cost, no change.
+Diff.empty = Diff(None, [])
+
+class GLevenshtein:
+    """Levenshtein diff generalized to arbitrary lists and cost types.
+
+    `cost` is called on every candidate operation (an Update, an Edition, or
+    None once both lists are exhausted). It returns a value supporting `+`
+    and ordering, or None, in which case the operation adds nothing to the
+    resulting Diff.
+    """
+    def __init__(self, cost):
+        self.cost = cost
+    def rate(self, operation):
+        """Return the Diff for one operation (Diff.empty if its cost is None)."""
+        cost = self.cost(operation)
+        return Diff.empty if cost is None else Diff(cost, [operation])
+    def diff(self, l1, l2):
+        """Return the cheapest Diff turning the list l1 into the list l2."""
+        return self._diff(l1, 0, l2, 0, {})
+    def _diff(self, l1, i1, l2, i2, memo=None):
+        """Return the cheapest Diff between the suffixes l1[i1:] and l2[i2:].
+
+        `memo` caches the result for each (i1, i2) pair: the same pair is
+        reached through many edit paths, and recomputing it on every visit
+        makes the recursion exponential in the lengths of the lists.
+        """
+        if memo is None:
+            memo = {}
+        key = (i1, i2)
+        if key in memo:
+            return memo[key]
+        if i1 >= len(l1):
+            if len(l2) == i2:
+                result = self.rate(None)
+            else:
+                # Only insertions can consume what remains of l2.
+                result = reduce(plus, map(self.rate, map(Insert, l2[i2:])))
+        elif i2 >= len(l2):
+            # l1 is not exhausted in this branch, so only deletions remain.
+            result = reduce(plus, map(self.rate, map(Delete, l1[i1:])))
+        else:
+            if l1[i1] == l2[i2]:
+                delta = Diff.empty
+            else:
+                delta = self.rate(Update(l1[i1], l2[i2]))
+            result = min(
+                    (self.rate(Delete(l1[i1])) + self._diff(l1, i1+1, l2, i2, memo)),
+                    (self.rate(Insert(l2[i2])) + self._diff(l1, i1, l2, i2+1, memo)),
+                    (delta + self._diff(l1, i1+1, l2, i2+1, memo))
+                    )
+        memo[key] = result
+        return result
--- a/ne_score/namedEntity.py
+++ b/ne_score/namedEntity.py
+from ne_score.cartesianCost import CartesianCost
+from ne_score.generalLevenshtein import Diff, Edition, Update, GLevenshtein
+import ne_score.tokens as tokens
+
+class Cost(CartesianCost):
+    """Cost of an edit on a list of named entities."""
+    # Declared static so that it can be called on the class or an instance.
+    @staticmethod
+    def ranking():
+        """Return the factor names in the order used to compare costs."""
+        return ['missingOrExtra', 'wrongType', 'missingType', 'tokensDiff']
+
+class NE:
+    """A named entity: a type (possibly None) and the tokens it spans."""
+    def __init__(self, NEType, wElems):
+        self.NEType = NEType
+        # Only the `.text` of each element is read to build the token list.
+        self.tokens = tokens.get([w.text for w in wElems])
+    def __str__(self):
+        return "|{tokens}|({NEType})".format(
+                tokens=','.join(self.tokens),
+                NEType=self.NEType
+            )
+    def __eq__(self, other):
+        # Let Python fall back to its default comparison for foreign types
+        # instead of failing on a missing attribute.
+        if not isinstance(other, NE):
+            return NotImplemented
+        return self.NEType == other.NEType and self.tokens == other.tokens
+
+def cost(operation):
+    """Rate one edit operation on a list of named entities.
+
+    An Update between two entities sharing no token is given an infinite
+    cost; otherwise it is rated on the type mismatch and on the diff between
+    the two token lists. An Edition (insertion or deletion) counts as one
+    missing or extra entity. Any other operation (such as None) has no cost.
+    """
+    if isinstance(operation, Update):
+        initial = operation.initial
+        final = operation.final
+        if not set(initial.tokens).intersection(final.tokens):
+            return Cost(infinite=True)
+        # A type is "missing" when only the initial entity lacks one; any
+        # other difference between the two types counts as a wrong type.
+        missingType = final.NEType is not None and initial.NEType is None
+        return Cost(
+                wrongType=not missingType and initial.NEType != final.NEType,
+                missingType=missingType,
+                tokensDiff=tokens.levenshtein.diff(
+                    initial.tokens,
+                    final.tokens
+                    )
+                )
+    if isinstance(operation, Edition):
+        return Cost(missingOrExtra=1, tokensDiff=Diff.empty)
+    return None
+
+levenshtein = GLevenshtein(cost)
+
+"""
+    def diff(self, other):
+        missingType = self.eNEType is None or other.eNEType is None
+        return {
+                'missingType': missingType,
+                'wrongType': not missingType and self.eNEType != other.eNEType
+                # TODO: write a levenshtein for  tokens with namesWeight and
+                # otherWeight
+            }
+class Cost:
+    def __init__(self, impossible, wrongType, missingType, tokensCost):
+        self.impossible = impossible
+        self.wrongType = wrongType
+        self.missingType = missingType
+        self.tokenCosts = tokenCosts
+    def __add__(self, other):
+        return Cost(
+                self.impossible | other.impossible,
+                self.name + other.name,
+                self.minorWord + self.minorWord
+                )
+    def __lt__(self, other):
+        selfTriple = (self.impossible, self.name, self.minorWord)
+        otherTriple = (other.impossible, other.name, other.minorWord)
+        return selfTriple < otherTriple
+    def __le__(self, other):
+        selfTriple = (self.impossible, self.name, self.minorWord)
+        otherTriple = (other.impossible, other.name, other.minorWord)
+        return selfTriple <= otherTriple
+    def __gt__(self, other):
+        selfTriple = (self.impossible, self.name, self.minorWord)
+        otherTriple = (other.impossible, other.name, other.minorWord)
+        return selfTriple > otherTriple
+    def __ge__(self, other):
+        selfTriple = (self.impossible, self.name, self.minorWord)
+        otherTriple = (other.impossible, other.name, other.minorWord)
+        return selfTriple >= otherTriple
+"""
+
--- a/ene_comparator/sequence.py
+++ b/ene_comparator/sequence.py
 from lxml import etree
+from ne_score.namedEntity import NE
+import ne_score.namedEntity as nE
 import re

-tokenizer = re.compile("[- ']")
-
-def getTokens(words):
-   return [s for word in words for s in tokenizer.split(word.strip()) if len(s) > 0] 
-
-class ENE:
-    def __init__(self, eNEType, wElems):
-        self.eNEType = eNEType
-        self.tokens = getTokens([w.text for w in wElems])
-
-    def diff(self, other):
-        missingType = self.eNEType is None or other.eNEType is None
-        return {
-                'missingType': missingType,
-                'wrongType': not missingType and self.eNEType != other.eNEType
-                # TODO: write a levenshtein for  tokens with namesWeight and
-                # otherWeight
-            }
-
-    def __str__(self):
-        return "|{tokens}|({eNEType})".format(
-                tokens=','.join(self.tokens),
-                eNEType=self.eNEType
-            )
-
-def eNEFromPerdidoElement(e):
+def nEFromPerdidoElement(e):
+    """Build the NE for the element `e`, typed after its 'type' attribute.
+
+    'place', 'date' and 'person' map to 'Spatial', 'Date' and 'Person';
+    any other value yields an untyped NE.
+    """
     ws = e.xpath('.//w')
-    wsType = None
-    for w in ws:
-        if 'type' in w.attrib:
-            wsType = w.attrib['type']
-    if (e.attrib['type'], wsType) == ('place', 'NPr'):
-        return ENE('NP_Spatial', ws)
+    if e.attrib['type'] == 'place':
+        return NE('Spatial', ws)
+    elif e.attrib['type'] == 'date':
+        return NE('Date', ws)
+    elif e.attrib['type'] == 'person':
+        return NE('Person', ws)
     else:
-        return ENE(None, ws)
+        return NE(None, ws)
+
+posPrefix = re.compile("N._")

-def eNEFromTEIWAElement(e):
+def nEFromTEIWAElement(e):
    if e.tag == 'w':
        words = [e]
    else:
        words = e.xpath('./w')
-    return ENE(e.attrib['enc_tags'], words)
+    return NE(posPrefix.sub("", e.attrib['enc_tags']), words)

 def fromPerdido(dom):
-    return map(eNEFromPerdidoElement, dom.xpath('//rs[@subtype="no"]'))
+    return map(nEFromPerdidoElement, dom.xpath('//rs[@subtype="no"]'))

 def fromTEIWA(dom):
    elements = dom.xpath('//*[contains(@enc_tags, "NP_")]')
-    return map(eNEFromTEIWAElement, elements)
+    return map(nEFromTEIWAElement, elements)

-def eNESequence(filePath):
+def nESequence(filePath):
    try:
        with open(filePath) as f:
            try:

--- a/ne_score/tokens.py
+++ b/ne_score/tokens.py
+from ne_score.generalLevenshtein import Edition, Update, GLevenshtein
+from ne_score.cartesianCost import CartesianCost
+import re
+
+tokenizer = re.compile("[- ']")
+
+def get(words):
+    """Return the non-empty pieces of every word, split with `tokenizer`."""
+    pieces = []
+    for word in words:
+        for piece in tokenizer.split(word.strip()):
+            if len(piece) > 0:
+                pieces.append(piece)
+    return pieces
+
+# Every entry needs its own trailing comma: two adjacent string literals are
+# silently concatenated into a single word.
+# NOTE(review): the tokenizer in this module splits on "'", so "l'" may never
+# match a token -- confirm whether "l" was intended.
+determiners = {
+        "un", "une", "des",
+        "le", "la", "l'", "les",
+        }
+
+prepositions = {
+        "à", "de", "dans", "pour", "vers", "près", "en", "lès", "sur", "sous",
+        "au", "aux", "du",
+        }
+
+# Words rated under the 'smallWords' factor rather than 'names'.
+smallWords = determiners.union(prepositions)
+
+class Cost(CartesianCost):
+    """Cost of an edit on a list of tokens."""
+    # Declared static so that it can be called on the class or an instance.
+    @staticmethod
+    def ranking():
+        """Return the factor names in the order used to compare costs."""
+        return ['names', 'smallWords']
+
+def cost(operation):
+    """Rate one edit operation on a list of tokens.
+
+    An Update gets an infinite cost. Inserting or deleting a token counts
+    under 'smallWords' when the token belongs to `smallWords`, and under
+    'names' otherwise. Any other operation (such as None) has no cost.
+    """
+    if isinstance(operation, Update):
+        return Cost(infinite=True)
+    if isinstance(operation, Edition):
+        if operation.element in smallWords:
+            # The keyword must be one of the names in Cost.ranking(): any
+            # other keyword is ignored and the edit would weigh nothing.
+            return Cost(smallWords=1)
+        return Cost(names=1)
+    return None
+
+levenshtein = GLevenshtein(cost)
--- a/setup.py
+++ b/setup.py
 from distutils.core import setup

 setup(
-        name="ENE-comparator",
+        name="NE-score",
        version="0.1.0",
        description="yeah, maybe",
        author="Alice Brenon",
        author_email="alice.brenon@liris.cnrs.fr",
-        url="https://gitlab.liris.cnrs.fr/abrenon/ene-comparator",
-        packages=["ene_comparator"],
+        url="https://gitlab.liris.cnrs.fr/abrenon/ne-score",
+        packages=["ne_score"],
        install_requires=['lxml'],
-        scripts=['comparator.py'],
+        scripts=['ne-score'],
        )