From bfa8aeb4acb1c3f030e1465490f22b996e95a0b3 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Wed, 8 Dec 2021 22:04:14 +0100
Subject: [PATCH] Generalize the levenshtein implementation to be able to use
 it on both the tokens and the named-entities lists

---
 README.md                      | 19 ++++----
 comparator.py                  | 42 -----------------
 ene_comparator/__init__.py     |  2 -
 ene_comparator/editDistance.py | 34 --------------
 ene_comparator/sequence.py     | 65 ---------------------------
 guix.scm                       | 31 +++++++++++++
 ne-score                       | 24 ++++++++++
 ne_score/__init__.py           |  0
 ne_score/cartesianCost.py      | 39 ++++++++++++++++
 ne_score/generalLevenshtein.py | 81 +++++++++++++++++++++++++++++++++
 ne_score/namedEntity.py        | 82 ++++++++++++++++++++++++++++++++++
 ne_score/sequence.py           | 49 ++++++++++++++++++++
 ne_score/tokens.py             | 37 +++++++++++++++
 setup.py                       |  8 ++--
 14 files changed, 358 insertions(+), 155 deletions(-)
 delete mode 100755 comparator.py
 delete mode 100644 ene_comparator/__init__.py
 delete mode 100644 ene_comparator/editDistance.py
 delete mode 100644 ene_comparator/sequence.py
 create mode 100644 guix.scm
 create mode 100755 ne-score
 create mode 100644 ne_score/__init__.py
 create mode 100644 ne_score/cartesianCost.py
 create mode 100644 ne_score/generalLevenshtein.py
 create mode 100644 ne_score/namedEntity.py
 create mode 100644 ne_score/sequence.py
 create mode 100644 ne_score/tokens.py

diff --git a/README.md b/README.md
index 91b52d8..e79e7b1 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,13 @@
-# ENE comparator
+# NE score
 
-Run the `./comparator.py` script. The weights of the edit distance can be
-customized with the following parameters (see `--help` to learn more):
+Rate a named-entity annotator against a truth reference.
 
-- `--missingENE`
-- `--missingType`
-- `--wrongType`
-- `--namesWeight`
-- `--otherWeight`
+The `ne-score` script expects two arguments in this order:
+- the path to the XML-TEI file containing the predictions
+- the path to the XML-TEI file containing the truth
+
+For now it simply displays the named entities found in the prediction and in the
+truth, then the diff computed between them.
+
+TODO: express this information in terms of precision/recall, and break it down
+by named-entity type (Person, Date or Place).
diff --git a/comparator.py b/comparator.py
deleted file mode 100755
index d1f44ad..0000000
--- a/comparator.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/python3
-
-import argparse
-from ene_comparator import eNESequence, Metric
-from sys import argv
-
-def getArgs():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("leftFile", help="path to the first file to compare")
-    parser.add_argument("rightFile", help="path to the second file to compare")
-    parser.add_argument("--missingENE", help="weight for a missing ENE",
-                        type=float)
-    parser.add_argument("--missingType", help="weight for an ENE with a missing type",
-                        type=float)
-    parser.add_argument("--wrongType", help="weight for an ENE with the wrong type",
-                        type=float)
-    parser.add_argument("--namesWeight", help="weight for missing \"import\" words (hopefully proper nouns, but technically just not articles or prepositions)",
-                        type=float)
-    parser.add_argument("--otherWeight", help="weight for other missing words (hopefull only articles or prepositions)",
-                        type=float)
-    return parser.parse_args()
-
-def getMetric(args):
-    missingENE = args.missingENE or 10.
-    missingType = args.missingType or 5.
-    wrongType = args.wrongType or 5.
-    namesWeight = args.namesWeight or 0.
-    otherWeight = args.otherWeight or 0.
-    return Metric(missingENE, missingType, wrongType, namesWeight, otherWeight)
-
-def main():
-    args = getArgs()
-    print(args)
-    m = getMetric(args)
-    leftSequence = eNESequence(args.leftFile)
-    rightSequence = eNESequence(args.rightFile)
-    print([str(s) for s in leftSequence])
-    print([str(s) for s in rightSequence])
-    print(m.differences(leftSequence, rightSequence))
-
-if __name__ == '__main__':
-    main()
diff --git a/ene_comparator/__init__.py b/ene_comparator/__init__.py
deleted file mode 100644
index 41ea9c2..0000000
--- a/ene_comparator/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .editDistance import Metric
-from .sequence import eNESequence
diff --git a/ene_comparator/editDistance.py b/ene_comparator/editDistance.py
deleted file mode 100644
index 8ec907f..0000000
--- a/ene_comparator/editDistance.py
+++ /dev/null
@@ -1,34 +0,0 @@
-def scalar(d1, d2):
-    s = 0
-    for k in set(d1.keys()).intersection(d2.keys()):
-        s += d1[k] * d2[k]
-    return s
-
-class Metric:
-    def __init__(self, missingENE, missingType, wrongType, namesWeight,
-            otherWeight):
-        self.weight = {
-                'missingENE': missingENE,
-                'missingType': missingType,
-                'wrongType': wrongType,
-                'namesWeight': namesWeight,
-                'otherWeight': otherWeight
-                }
-
-    def differences(self, l1, l2):
-        return self._differences(l1, 0, l2, 0)
-
-    def _differences(self, l1, i1, l2, i2):
-        missingENE = self.weight['missingENE']
-        if i1 >= len(l1):
-            return (len(l2)-i2) * missingENE
-        elif i2 >= len(l2):
-            return (len(l1)-i1) * missingENE
-        else:
-            delta = scalar(l1[i1].diff(l2[i2]), self.weight)
-            return min(
-                    missingENE + self._differences(l1, i1+1, l2, i2),
-                    missingENE + self._differences(l1, i1, l2, i2+1),
-                    delta + self._differences(l1, i1+1, l2, i2+1)
-                    )
-            
diff --git a/ene_comparator/sequence.py b/ene_comparator/sequence.py
deleted file mode 100644
index 3500a7f..0000000
--- a/ene_comparator/sequence.py
+++ /dev/null
@@ -1,65 +0,0 @@
-from lxml import etree
-import re
-
-tokenizer = re.compile("[- ']")
-
-def getTokens(words):
-   return [s for word in words for s in tokenizer.split(word.strip()) if len(s) > 0] 
-
-class ENE:
-    def __init__(self, eNEType, wElems):
-        self.eNEType = eNEType
-        self.tokens = getTokens([w.text for w in wElems])
-
-    def diff(self, other):
-        missingType = self.eNEType is None or other.eNEType is None
-        return {
-                'missingType': missingType,
-                'wrongType': not missingType and self.eNEType != other.eNEType
-                # TODO: write a levenshtein for  tokens with namesWeight and
-                # otherWeight
-            }
-
-    def __str__(self):
-        return "|{tokens}|({eNEType})".format(
-                tokens=','.join(self.tokens),
-                eNEType=self.eNEType
-            )
-
-def eNEFromPerdidoElement(e):
-    ws = e.xpath('.//w')
-    wsType = None
-    for w in ws:
-        if 'type' in w.attrib:
-            wsType = w.attrib['type']
-    if (e.attrib['type'], wsType) == ('place', 'NPr'):
-        return ENE('NP_Spatial', ws)
-    else:
-        return ENE(None, ws)
-
-def eNEFromTEIWAElement(e):
-    if e.tag == 'w':
-        words = [e]
-    else:
-        words = e.xpath('./w')
-    return ENE(e.attrib['enc_tags'], words)
-
-def fromPerdido(dom):
-    return map(eNEFromPerdidoElement, dom.xpath('//rs[@subtype="no"]'))
-
-def fromTEIWA(dom):
-    elements = dom.xpath('//*[contains(@enc_tags, "NP_")]')
-    return map(eNEFromTEIWAElement, elements)
-
-def eNESequence(filePath):
-    try:
-        with open(filePath) as f:
-            try:
-                dom = etree.parse(f)
-            except etree.XMLSyntaxError:
-                return None
-            rootTag = dom.getroot().tag
-            sequencer = fromPerdido if rootTag == 'tei.2' else fromTEIWA
-            return list(sequencer(dom))
-    except FileNotFoundError:
-        return None
diff --git a/guix.scm b/guix.scm
new file mode 100644
index 0000000..577180c
--- /dev/null
+++ b/guix.scm
@@ -0,0 +1,34 @@
+;; Guix package definition for ne-score.
+(use-modules (gnu packages xml)
+			 (guix build-system python)
+			 (guix download)
+			 (guix licenses)
+			 (guix packages))
+
+(package
+  (name "ne-score")
+  (version "0.1.0")
+  ;; NOTE(review): the all-zero hash below looks like a placeholder; confirm
+  ;; how the source is meant to be fetched before building from this file.
+  (source
+	(origin
+	  (method url-fetch)
+	  (uri ".")
+	  (file-name (string-append name "-" version))
+	  (sha256
+		(base32 "0000000000000000000000000000000000000000000000000000"))))
+  (build-system python-build-system)
+  (propagated-inputs
+	`(("python-lxml" ,python-lxml)))
+  (home-page "https://gitlab.liris.cnrs.fr/abrenon/ne-score")
+  (synopsis "Python tool to rate a named-entity annotator")
+  (description
+	"This tool detects the named-entities in XML-TEI files and computes a diff
+	on them to be able to measure the precision and recall of a named-entities
+	annotator.
+
+	The diff is computed by a generalization of the Levenshtein distance on
+	lists expecting a cost function to describe the edition process and which
+	can return an arbitrary (comparable) type. The implementation returns both
+	the cost and the edition path.")
+  (license bsd-3))
diff --git a/ne-score b/ne-score
new file mode 100755
index 0000000..fbd47e8
--- /dev/null
+++ b/ne-score
@@ -0,0 +1,38 @@
+#!/usr/bin/python3
+"""Display the named entities of two XML-TEI files and the diff between them."""
+
+import argparse
+from ne_score.sequence import nESequence
+import ne_score.namedEntity as NE
+from sys import argv
+
+def getArgs():
+    """Parse the command line; the result holds the `prediction` and `truth` paths."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("prediction", help="path to the first file to compare")
+    parser.add_argument("truth", help="path to the second file to compare")
+    return parser.parse_args()
+
+def _loadSequence(path):
+    """Return the named entities read from `path` or exit with an error.
+
+    nESequence returns None for a missing file or malformed XML; report that
+    instead of letting the caller crash on a TypeError when iterating.
+    """
+    sequence = nESequence(path)
+    if sequence is None:
+        raise SystemExit("ne-score: unable to read named entities from " + path)
+    return sequence
+
+def main():
+    """Print both named-entity sequences, then the diff computed on them."""
+    args = getArgs()
+    predicted = _loadSequence(args.prediction)
+    truth = _loadSequence(args.truth)
+    print([str(s) for s in predicted])
+    print([str(s) for s in truth])
+    diff = NE.levenshtein.diff(predicted, truth)
+    print(str(diff))
+
+if __name__ == '__main__':
+    main()
diff --git a/ne_score/__init__.py b/ne_score/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ne_score/cartesianCost.py b/ne_score/cartesianCost.py
new file mode 100644
index 0000000..3849fb3
--- /dev/null
+++ b/ne_score/cartesianCost.py
@@ -0,0 +1,54 @@
+class CartesianCost:
+    """Cost made of several named factors, compared in lexicographic order.
+
+    Subclasses override `ranking` to list the factor names from the most to the
+    least significant. A cost is either infinite or holds one value per factor
+    (0 for the factors not passed to the constructor); an infinite cost is
+    greater than any finite one.
+    """
+    def __init__(self, **kwargs):
+        """Build an infinite cost (`infinite=True`) or a cost from its factors.
+
+        Keyword arguments that `ranking` does not list are silently ignored.
+        """
+        # Honour the value of the flag, so that infinite=False stays finite.
+        self.infinite = bool(kwargs.get('infinite', False))
+        if not self.infinite:
+            self.factors = {
+                k: kwargs.get(k, 0) for k in self.__class__.ranking()
+                }
+    @staticmethod
+    def ranking():
+        """Return the ordered list of factor names (must be overridden)."""
+        raise NotImplementedError("Must override ranking")
+    def getCosts(self):
+        """Return a tuple ordering this cost among the others.
+
+        The leading boolean sorts every infinite cost after the finite ones.
+        """
+        if self.infinite:
+            return (True,)
+        ranked = tuple(self.factors[k] for k in self.__class__.ranking())
+        return (False, ranked)
+    def __add__(self, other):
+        """Add factor by factor; the sum is infinite when an operand is."""
+        if self.infinite or other.infinite:
+            return self.__class__(infinite=True)
+        total = {
+            k: self.factors[k] + other.factors[k]
+            for k in self.__class__.ranking()
+            }
+        return self.__class__(**total)
+    def __lt__(self, other):
+        return self.getCosts() < other.getCosts()
+    def __le__(self, other):
+        return self.getCosts() <= other.getCosts()
+    def __gt__(self, other):
+        return self.getCosts() > other.getCosts()
+    def __ge__(self, other):
+        return self.getCosts() >= other.getCosts()
+    def __str__(self):
+        """Return "infinite" or the factors as a dict of stringified values."""
+        if self.infinite:
+            return "infinite"
+        return str({k: str(v) for k, v in self.factors.items()})
diff --git a/ne_score/generalLevenshtein.py b/ne_score/generalLevenshtein.py
new file mode 100644
index 0000000..bf4130b
--- /dev/null
+++ b/ne_score/generalLevenshtein.py
@@ -0,0 +1,110 @@
+from functools import reduce
+
+class Update:
+    """Replacement of the element `initial` by the element `final`."""
+    def __init__(self, initial, final):
+        self.initial = initial
+        self.final = final
+    def __str__(self):
+        return str(self.initial) + " -> " + str(self.final)
+
+class Edition:
+    """Insertion (sign '+') or deletion (sign '-') of a single element."""
+    def __init__(self, sign, element):
+        self.sign = sign
+        self.element = element
+    def __str__(self):
+        return self.sign + str(self.element)
+
+def Insert(x):
+    """Return the Edition inserting `x`."""
+    return Edition('+', x)
+
+def Delete(x):
+    """Return the Edition deleting `x`."""
+    return Edition('-', x)
+
+def plus(a, b):
+    """Add two values, None being the neutral element."""
+    if a is None:
+        return b
+    elif b is None:
+        return a
+    else:
+        return a + b
+
+class Diff:
+    """A list of changes along with their total cost.
+
+    A cost of None means "no cost": it is neutral for the addition and lower
+    than any actual cost in the comparisons.
+    """
+    def __init__(self, cost, changes):
+        self.cost = cost
+        self.changes = changes
+    def __str__(self):
+        return "{diff}({cost})".format(
+                diff=list(map(str, self.changes)),
+                cost=str(self.cost)
+                )
+    def __add__(self, other):
+        return Diff(plus(self.cost, other.cost), self.changes + other.changes)
+    def __lt__(self, other):
+        return other.cost is not None and (self.cost is None or self.cost < other.cost)
+    def __le__(self, other):
+        return self.cost is None or (other.cost is not None and self.cost <= other.cost)
+    def __gt__(self, other):
+        return self.cost is not None and (other.cost is None or self.cost > other.cost)
+    def __ge__(self, other):
+        return other.cost is None or (self.cost is not None and self.cost >= other.cost)
+
+Diff.empty = Diff(None, [])
+
+class GLevenshtein:
+    """Generalized Levenshtein diff between two lists.
+
+    `cost` is called on an Update, an Edition or None (when both lists are
+    exhausted) and returns the cost of that operation, or None when it is free.
+    The costs only have to support `+` and the ordering comparisons.
+    """
+    def __init__(self, cost):
+        self.cost = cost
+    def rate(self, operation):
+        """Return the Diff made of `operation` alone, empty when it is free."""
+        cost = self.cost(operation)
+        return Diff.empty if cost is None else Diff(cost, [operation])
+    def diff(self, l1, l2):
+        """Return the cheapest Diff turning the list `l1` into the list `l2`."""
+        return self._diff(l1, 0, l2, 0)
+    def _diff(self, l1, i1, l2, i2, memo=None):
+        """Return the cheapest Diff from `l1[i1:]` to `l2[i2:]`.
+
+        `memo` caches the result for each (i1, i2) pair: without it every pair
+        of suffixes is explored again from each of the three branches below
+        and the run time grows exponentially with the length of the lists.
+        """
+        if memo is None:
+            memo = {}
+        key = (i1, i2)
+        if key in memo:
+            return memo[key]
+        if i1 >= len(l1):
+            if i2 >= len(l2):
+                result = self.rate(None)
+            else:
+                result = reduce(plus, map(self.rate, map(Insert, l2[i2:])))
+        elif i2 >= len(l2):
+            # l1[i1:] is not empty here, otherwise the first test would hold.
+            result = reduce(plus, map(self.rate, map(Delete, l1[i1:])))
+        else:
+            if l1[i1] == l2[i2]:
+                delta = Diff.empty
+            else:
+                delta = self.rate(Update(l1[i1], l2[i2]))
+            result = min(
+                    (self.rate(Delete(l1[i1])) + self._diff(l1, i1+1, l2, i2, memo)),
+                    (self.rate(Insert(l2[i2])) + self._diff(l1, i1, l2, i2+1, memo)),
+                    (delta + self._diff(l1, i1+1, l2, i2+1, memo))
+                    )
+        memo[key] = result
+        return result
diff --git a/ne_score/namedEntity.py b/ne_score/namedEntity.py
new file mode 100644
index 0000000..daa438f
--- /dev/null
+++ b/ne_score/namedEntity.py
@@ -0,0 +1,54 @@
+from ne_score.cartesianCost import CartesianCost
+from ne_score.generalLevenshtein import Diff, Edition, Update, GLevenshtein
+import ne_score.tokens as tokens
+
+class Cost(CartesianCost):
+    """Cost of an operation on named entities, most significant factor first."""
+    @staticmethod
+    def ranking():
+        return ['missingOrExtra', 'wrongType', 'missingType', 'tokensDiff']
+
+class NE:
+    """A named entity: its type (or None) and the tokens of its words."""
+    def __init__(self, NEType, wElems):
+        self.NEType = NEType
+        self.tokens = tokens.get([w.text for w in wElems])
+    def __str__(self):
+        return "|{tokens}|({NEType})".format(
+                tokens=','.join(self.tokens),
+                NEType=self.NEType
+            )
+    def __eq__(self, other):
+        # Let Python handle the comparison with anything that is not an NE
+        # instead of failing on a missing attribute.
+        if not isinstance(other, NE):
+            return NotImplemented
+        return self.NEType == other.NEType and self.tokens == other.tokens
+
+def cost(operation):
+    """Rate an operation of the diff between two lists of named entities.
+
+    An Update is only possible between entities sharing at least one token
+    (its cost is infinite otherwise); an insertion or a deletion counts as one
+    missing or extra entity; anything else costs nothing (None).
+    """
+    if isinstance(operation, Update):
+        initial = operation.initial
+        final = operation.final
+        if set(initial.tokens).isdisjoint(final.tokens):
+            return Cost(infinite=True)
+        missingType = final.NEType is not None and initial.NEType is None
+        return Cost(
+                wrongType=not missingType and initial.NEType != final.NEType,
+                missingType=missingType,
+                tokensDiff=tokens.levenshtein.diff(
+                    initial.tokens,
+                    final.tokens
+                    )
+                )
+    elif isinstance(operation, Edition):
+        return Cost(missingOrExtra=1, tokensDiff=Diff.empty)
+    else:
+        return None
+
+levenshtein = GLevenshtein(cost)
diff --git a/ne_score/sequence.py b/ne_score/sequence.py
new file mode 100644
index 0000000..315d6ca
--- /dev/null
+++ b/ne_score/sequence.py
@@ -0,0 +1,57 @@
+from lxml import etree
+from ne_score.namedEntity import NE
+import ne_score.namedEntity as nE
+import re
+
+# Named-entity type for each `type` attribute found on a Perdido element.
+_perdidoTypes = {'place': 'Spatial', 'date': 'Date', 'person': 'Person'}
+
+def nEFromPerdidoElement(e):
+    """Build the NE for a Perdido element from its `type` and its <w> words.
+
+    The NE has no type (None) when the attribute is missing or not handled.
+    """
+    ws = e.xpath('.//w')
+    return NE(_perdidoTypes.get(e.attrib.get('type')), ws)
+
+posPrefix = re.compile("N._")
+
+def nEFromTEIWAElement(e):
+    """Build the NE for a TEIWA element, typed by its `enc_tags` attribute.
+
+    The words are the element itself when it is a <w>, its <w> children
+    otherwise; whatever matches `posPrefix` is removed from the type.
+    """
+    if e.tag == 'w':
+        words = [e]
+    else:
+        words = e.xpath('./w')
+    return NE(posPrefix.sub("", e.attrib['enc_tags']), words)
+
+def fromPerdido(dom):
+    """Return an iterator over the NEs of a Perdido document."""
+    return map(nEFromPerdidoElement, dom.xpath('//rs[@subtype="no"]'))
+
+def fromTEIWA(dom):
+    """Return an iterator over the NEs of a TEIWA document."""
+    elements = dom.xpath('//*[contains(@enc_tags, "NP_")]')
+    return map(nEFromTEIWAElement, elements)
+
+def nESequence(filePath):
+    """Return the list of NEs found in the XML file at `filePath`.
+
+    A document whose root tag is `tei.2` is read as Perdido output, any other
+    as TEIWA output. Returns None when the file does not exist or is not
+    well-formed XML.
+    """
+    try:
+        with open(filePath) as f:
+            try:
+                dom = etree.parse(f)
+            except etree.XMLSyntaxError:
+                return None
+            rootTag = dom.getroot().tag
+            sequencer = fromPerdido if rootTag == 'tei.2' else fromTEIWA
+            return list(sequencer(dom))
+    except FileNotFoundError:
+        return None
diff --git a/ne_score/tokens.py b/ne_score/tokens.py
new file mode 100644
index 0000000..e189495
--- /dev/null
+++ b/ne_score/tokens.py
@@ -0,0 +1,52 @@
+from ne_score.generalLevenshtein import Edition, Update, GLevenshtein
+from ne_score.cartesianCost import CartesianCost
+import re
+
+tokenizer = re.compile("[- ']")
+
+def get(words):
+    """Split each word on spaces, dashes and apostrophes; drop empty tokens.
+
+    Words that are None or empty are skipped instead of raising.
+    """
+    return [s for word in words if word for s in tokenizer.split(word.strip()) if s]
+
+# NOTE(review): `get` splits on apostrophes, so a token can never be "l'";
+# confirm whether "l" (and "d") should be listed instead.
+determiners = {
+        "un", "une", "des",
+        "le", "la", "l'", "les",
+        }
+
+prepositions = {
+        "à", "de", "dans", "pour", "vers", "près", "en", "lès", "sur", "sous",
+        "au", "aux", "du",
+        }
+
+smallWords = determiners.union(prepositions)
+
+class Cost(CartesianCost):
+    """Cost of an operation on tokens: names weigh more than small words."""
+    @staticmethod
+    def ranking():
+        return ['names', 'smallWords']
+
+def cost(operation):
+    """Rate an operation of the diff between two lists of tokens.
+
+    Tokens cannot be updated (infinite cost); inserting or deleting one counts
+    as a small word or as a name; anything else costs nothing (None).
+    """
+    if isinstance(operation, Update):
+        return Cost(infinite=True)
+    elif isinstance(operation, Edition):
+        if operation.element in smallWords:
+            # The factor is named 'smallWords' in Cost.ranking(): any other
+            # keyword is ignored by CartesianCost and the edit would be free.
+            return Cost(smallWords=1)
+        else:
+            return Cost(names=1)
+    else:
+        return None
+
+levenshtein = GLevenshtein(cost)
diff --git a/setup.py b/setup.py
index 5633a7a..f9bb05d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,14 +1,15 @@
-from distutils.core import setup
+# setuptools is needed for install_requires, which distutils ignores.
+from setuptools import setup
 
 setup(
-        name="ENE-comparator",
+        name="NE-score",
         version="0.1.0",
         description="yeah, maybe",
         author="Alice Brenon",
         author_email="alice.brenon@liris.cnrs.fr",
-        url="https://gitlab.liris.cnrs.fr/abrenon/ene-comparator",
-        packages=["ene_comparator"],
+        url="https://gitlab.liris.cnrs.fr/abrenon/ne-score",
+        packages=["ne_score"],
         install_requires=['lxml'],
-        scripts=['comparator.py'],
+        scripts=['ne-score'],
         )
 
-- 
GitLab