Skip to content
Snippets Groups Projects
Commit bfa8aeb4 authored by Alice Brenon's avatar Alice Brenon
Browse files

Generalize the levenshtein implementation to be able to use it on both the...

Generalize the levenshtein implementation to be able to use it on both the tokens and the named-entities lists
parent 15503ca9
No related branches found
No related tags found
No related merge requests found
# ENE comparator
# NE score
Run the `./comparator.py` script. The weights of the edit distance can be
customized with the following parameters (see `--help` to learn more):
Rate a named-entity annotator against a truth reference.
- `--missingENE`
- `--missingType`
- `--wrongType`
- `--namesWeight`
- `--otherWeight`
The `ne-score` script expects two arguments in this order:
- the path to the XML-TEI file containing the predictions
- the path to the XML-TEI file containing the truth
For now it simply displays the named entities found in the prediction and in the truth,
then the diff computed between the two.
TODO: convert this information into precision/recall figures, and break it down
by named-entity type (Person, Date or Place).
#!/usr/bin/python3
import argparse
from ene_comparator import eNESequence, Metric
from sys import argv
def getArgs():
    """Parse the command line of the comparator script.

    Two positional paths (``leftFile`` and ``rightFile``) are required. Each
    weight option is an optional float which stays ``None`` when it is not
    given on the command line.

    Returns:
        The ``argparse.Namespace`` built from ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    positionals = (
        ("leftFile", "path to the first file to compare"),
        ("rightFile", "path to the second file to compare"),
    )
    weights = (
        ("--missingENE", "weight for a missing ENE"),
        ("--missingType", "weight for an ENE with a missing type"),
        ("--wrongType", "weight for an ENE with the wrong type"),
        ("--namesWeight", 'weight for missing "import" words (hopefully proper nouns, but technically just not articles or prepositions)'),
        ("--otherWeight", "weight for other missing words (hopefull only articles or prepositions)"),
    )
    # Registration order is kept so that the generated --help is unchanged.
    for name, description in positionals:
        cli.add_argument(name, help=description)
    for flag, description in weights:
        cli.add_argument(flag, help=description, type=float)
    return cli.parse_args()
def getMetric(args):
    """Build the edit-distance ``Metric`` from the parsed command line.

    A weight that was not given on the command line is ``None`` and is
    replaced by its default: 10 for ``missingENE``, 5 for ``missingType`` and
    ``wrongType``, 0 for ``namesWeight`` and ``otherWeight``. A weight
    explicitly set to ``0`` is kept as given rather than being mistaken for a
    missing option.

    Args:
        args: namespace exposing the five weight attributes.

    Returns:
        The ``Metric`` configured with the resolved weights.
    """
    defaults = (
        ('missingENE', 10.),
        ('missingType', 5.),
        ('wrongType', 5.),
        ('namesWeight', 0.),
        ('otherWeight', 0.),
    )
    weights = []
    for name, fallback in defaults:
        given = getattr(args, name)
        # `given or fallback` would also discard a legitimate 0.0 weight.
        weights.append(fallback if given is None else given)
    return Metric(*weights)
def main():
    """Entry point: print both sequences, then their weighted distance.

    The parsed arguments are echoed first, then the entities read from each
    file, and finally the distance computed by the configured metric.
    """
    args = getArgs()
    print(args)
    metric = getMetric(args)
    # Both files are loaded before anything about them is printed.
    sequences = [eNESequence(path) for path in (args.leftFile, args.rightFile)]
    for sequence in sequences:
        print([str(entity) for entity in sequence])
    print(metric.differences(*sequences))


if __name__ == '__main__':
    main()
from .editDistance import Metric
from .sequence import eNESequence
def scalar(d1, d2):
    """Return the dot product of two mappings over the keys they share.

    Keys present in only one of the mappings contribute nothing, and the
    result is ``0`` when there is no common key.
    """
    shared = set(d1.keys()).intersection(d2.keys())
    return sum(d1[key] * d2[key] for key in shared)


class Metric:
    """Weighted edit distance between two sequences of entities.

    Elements of the compared sequences must provide a ``diff(other)`` method
    returning a mapping whose keys are weight names; that mapping is combined
    with the weights through ``scalar`` to price the pairing of two elements.
    """

    def __init__(self, missingENE, missingType, wrongType, namesWeight,
                 otherWeight):
        self.weight = {
            'missingENE': missingENE,
            'missingType': missingType,
            'wrongType': wrongType,
            'namesWeight': namesWeight,
            'otherWeight': otherWeight
        }

    def differences(self, l1, l2):
        """Return the minimal total cost of turning ``l1`` into ``l2``."""
        return self._differences(l1, 0, l2, 0)

    def _differences(self, l1, i1, l2, i2):
        """Return the minimal cost of aligning ``l1[i1:]`` with ``l2[i2:]``.

        Every skipped element (on either side) costs ``missingENE``; pairing
        two elements costs the weighted sum of their ``diff``. The table is
        filled bottom-up, one row per element of ``l1``, so the work is
        proportional to ``len(l1) * len(l2)`` instead of growing
        exponentially as a plain three-way recursion would.
        """
        missingENE = self.weight['missingENE']
        n1, n2 = len(l1), len(l2)
        if i1 >= n1:
            return (n2 - i2) * missingENE
        if i2 >= n2:
            return (n1 - i1) * missingENE
        # below[k] is the cost of aligning l1[a+1:] with l2[i2+k:]; it starts
        # with the row where l1 is exhausted (only insertions remain).
        below = [(n2 - j) * missingENE for j in range(i2, n2 + 1)]
        for a in range(n1 - 1, i1 - 1, -1):
            current = [None] * len(below)
            # l2 exhausted: everything left in l1 is missing.
            current[-1] = (n1 - a) * missingENE
            for k in range(len(below) - 2, -1, -1):
                delta = scalar(l1[a].diff(l2[i2 + k]), self.weight)
                current[k] = min(
                    missingENE + below[k],
                    missingENE + current[k + 1],
                    delta + below[k + 1]
                )
            below = current
        return below[0]
guix.scm 0 → 100644
(use-modules (gnu packages xml)
(guix build-system python)
(guix download)
(guix licenses)
(guix packages))
;; Package definition for ne-score, built with the Python build system and
;; depending on lxml at run time.
(package
  (name "ne-score")
  (version "0.1.0")
  (source
    (origin
      (method url-fetch)
      ;; NOTE(review): the "." URI and the all-zero hash look like
      ;; placeholders for a build from the local checkout -- confirm before
      ;; publishing this definition.
      (uri ".")
      (file-name (string-append name "-" version))
      (sha256
        (base32 "0000000000000000000000000000000000000000000000000000"))))
  (build-system python-build-system)
  (propagated-inputs
    `(("python-lxml" ,python-lxml)))
  (home-page "https://gitlab.liris.cnrs.fr/abrenon/ne-score")
  ;; Guix synopses must not start with an article nor end with a period.
  (synopsis "Python tool to rate a named-entity annotator")
  (description
    "This tool detects the named-entities in XML-TEI files and computes a diff
on them to be able to measure the precision and recall of a named-entities
annotator.
The diff is computed by a generalization of the Levenshtein distance on
lists expecting a cost function to describe the edition process and which
can return an arbitrary (comparable) type. The implementation returns both
the cost and the edition path.")
  (license bsd-3))
ne-score 0 → 100755
#!/usr/bin/python3
import argparse
from ne_score.sequence import nESequence
import ne_score.namedEntity as NE
from sys import argv
def getArgs():
    """Parse the two positional paths expected by the ``ne-score`` script.

    Returns:
        The ``argparse.Namespace`` holding ``prediction`` and ``truth``.
    """
    cli = argparse.ArgumentParser()
    expected = (
        ("prediction", "path to the first file to compare"),
        ("truth", "path to the second file to compare"),
    )
    for name, description in expected:
        cli.add_argument(name, help=description)
    return cli.parse_args()
def main():
    """Entry point: print the predicted and true entities, then their diff."""
    args = getArgs()
    # Both files are loaded before anything about them is printed.
    sequences = [nESequence(path) for path in (args.prediction, args.truth)]
    for sequence in sequences:
        print([str(entity) for entity in sequence])
    predicted, truth = sequences
    print(str(NE.levenshtein.diff(predicted, truth)))


if __name__ == '__main__':
    main()
class CartesianCost:
    """Cost made of several named factors compared lexicographically.

    Subclasses override ``ranking`` to return the factor names, most
    significant first. A cost is either *infinite* or a value for each factor;
    factors not passed to the constructor default to ``0``. Keyword arguments
    that are neither ``infinite`` nor a ranked factor are ignored.
    """

    def __init__(self, **kwargs):
        # Honour the value of the flag: `infinite=False` is a finite cost.
        self.infinite = bool(kwargs.get('infinite', False))
        self.factors = {k: kwargs.get(k, 0) for k in type(self).ranking()}

    @staticmethod
    def ranking():
        """Return the factor names, most significant first."""
        raise NotImplementedError("Must override ranking")

    def getCosts(self):
        """Return the tuple used to order costs.

        Its first item is the infinite flag, so that any infinite cost
        compares greater than any finite one; finite costs then compare on
        their factors, in ranking order.
        """
        if self.infinite:
            return (True,)
        factors = tuple(self.factors[k] for k in type(self).ranking())
        return (False, factors)

    def __add__(self, other):
        """Add two costs factor by factor; infinity absorbs everything."""
        if self.infinite or other.infinite:
            return type(self)(infinite=True)
        summed = {
            k: self.factors[k] + other.factors[k]
            for k in type(self).ranking()
        }
        return type(self)(**summed)

    def __lt__(self, other):
        return self.getCosts() < other.getCosts()

    def __le__(self, other):
        return self.getCosts() <= other.getCosts()

    def __gt__(self, other):
        return self.getCosts() > other.getCosts()

    def __ge__(self, other):
        return self.getCosts() >= other.getCosts()

    def __str__(self):
        if self.infinite:
            return "infinite"
        return str({k: str(v) for k, v in self.factors.items()})
from functools import reduce
class Update:
    """Edit operation replacing the element ``initial`` by ``final``."""

    def __init__(self, initial, final):
        self.initial, self.final = initial, final

    def __str__(self):
        # Rendered as "<initial> -> <final>".
        return " -> ".join((str(self.initial), str(self.final)))
class Edition:
    """Edit operation adding or removing one element.

    ``sign`` is the textual marker put in front of the element when the
    operation is printed.
    """

    def __init__(self, sign, element):
        self.sign, self.element = sign, element

    def __str__(self):
        return "".join((self.sign, str(self.element)))


def Insert(x):
    """Return the ``Edition`` marking ``x`` as added (sign ``+``)."""
    return Edition(sign='+', element=x)


def Delete(x):
    """Return the ``Edition`` marking ``x`` as removed (sign ``-``)."""
    return Edition(sign='-', element=x)
def plus(a, b):
    """Add two values, treating ``None`` as the neutral element.

    Returns the other operand when one of them is ``None`` (hence ``None``
    when both are), and ``a + b`` otherwise.
    """
    if a is None:
        return b
    return a if b is None else a + b
class Diff:
    """A list of changes together with their accumulated cost.

    A ``None`` cost stands for "no cost at all": it is neutral for addition
    and ranks below every actual cost in comparisons.
    """

    def __init__(self, cost, changes):
        self.cost = cost
        self.changes = changes

    def __str__(self):
        rendered = [str(change) for change in self.changes]
        return "{diff}({cost})".format(diff=rendered, cost=str(self.cost))

    def __add__(self, other):
        total = plus(self.cost, other.cost)
        # Concatenation builds a new list: neither operand is modified.
        return Diff(total, self.changes + other.changes)

    def __lt__(self, other):
        if other.cost is None:
            return False
        if self.cost is None:
            return True
        return self.cost < other.cost

    def __le__(self, other):
        if self.cost is None:
            return True
        if other.cost is None:
            return False
        return self.cost <= other.cost

    def __gt__(self, other):
        if self.cost is None:
            return False
        if other.cost is None:
            return True
        return self.cost > other.cost

    def __ge__(self, other):
        if other.cost is None:
            return True
        if self.cost is None:
            return False
        return self.cost >= other.cost


# The diff holding no change and no cost.
Diff.empty = Diff(None, [])
class GLevenshtein:
    """Levenshtein distance generalized to arbitrary lists and costs.

    ``cost`` is a callable receiving an operation (an ``Update``, an
    ``Edition`` built by ``Insert`` or ``Delete``, or ``None`` when both
    lists are exhausted) and returning its cost, or ``None`` when the
    operation should not appear in the resulting diff.
    """

    def __init__(self, cost):
        self.cost = cost

    def rate(self, operation):
        """Return the one-operation ``Diff`` priced by the cost function."""
        cost = self.cost(operation)
        return Diff.empty if cost is None else Diff(cost, [operation])

    def diff(self, l1, l2):
        """Return the cheapest ``Diff`` turning ``l1`` into ``l2``."""
        return self._diff(l1, 0, l2, 0)

    def _tail(self, edit, items, start):
        """Rate applying ``edit`` to every element of ``items[start:]``."""
        if len(items) == start:
            return self.rate(None)
        return reduce(plus, map(self.rate, map(edit, items[start:])))

    def _diff(self, l1, i1, l2, i2):
        """Return the cheapest ``Diff`` from ``l1[i1:]`` to ``l2[i2:]``.

        The table of sub-results is filled bottom-up, one row per element of
        ``l1``, instead of recursing three ways at every step: each pair of
        positions is rated once, which keeps the number of steps proportional
        to ``len(l1) * len(l2)`` and avoids deep recursion on long lists.
        Candidates are passed to ``min`` in the order deletion, insertion,
        update, so ties are resolved in that order.
        """
        n1, n2 = len(l1), len(l2)
        if i1 >= n1:
            return self._tail(Insert, l2, i2)
        if i2 >= n2:
            return self._tail(Delete, l1, i1)
        # below[k] is the best diff from l1[a+1:] to l2[i2+k:]; it starts
        # with the row where l1 is exhausted (only insertions remain).
        below = [self._tail(Insert, l2, j) for j in range(i2, n2 + 1)]
        for a in range(n1 - 1, i1 - 1, -1):
            current = [None] * len(below)
            # l2 exhausted: everything left in l1 has to be deleted.
            current[-1] = self._tail(Delete, l1, a)
            for k in range(len(below) - 2, -1, -1):
                b = i2 + k
                if l1[a] == l2[b]:
                    delta = Diff.empty
                else:
                    delta = self.rate(Update(l1[a], l2[b]))
                current[k] = min(
                    (self.rate(Delete(l1[a])) + below[k]),
                    (self.rate(Insert(l2[b])) + current[k + 1]),
                    (delta + below[k + 1])
                )
            below = current
        return below[0]
from ne_score.cartesianCost import CartesianCost
from ne_score.generalLevenshtein import Diff, Edition, Update, GLevenshtein
import ne_score.tokens as tokens
class Cost(CartesianCost):
    """Cost of an edit between two named-entity sequences."""

    @staticmethod
    def ranking():
        """Return the factor names, most significant first."""
        # Declared static: the function takes no `self`, so without the
        # decorator it could only be called through the class.
        return ['missingOrExtra', 'wrongType', 'missingType', 'tokensDiff']
class NE:
    """A named entity: a type and the tokens of the words it spans.

    Args:
        NEType: the type of the entity, or ``None`` when it has none.
        wElems: the word elements of the entity; their ``text`` is tokenized
            with ``tokens.get``.
    """

    def __init__(self, NEType, wElems):
        self.NEType = NEType
        self.tokens = tokens.get([w.text for w in wElems])

    def __str__(self):
        return "|{tokens}|({NEType})".format(
            tokens=','.join(self.tokens),
            NEType=self.NEType
        )

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing on a missing attribute.
        if not isinstance(other, NE):
            return NotImplemented
        return self.NEType == other.NEType and self.tokens == other.tokens

    # Defining __eq__ already makes instances unhashable; say so explicitly.
    __hash__ = None
def cost(operation):
    """Rate one edit operation between two named-entity sequences.

    - An ``Update`` between entities sharing no token is infinitely
      expensive. Otherwise it is rated on the type mismatch (``missingType``
      when only the initial entity lacks a type, ``wrongType`` for any other
      type difference) and on the diff between the two token lists.
    - An ``Edition`` (insertion or deletion) counts as one missing or extra
      entity.
    - Anything else has no cost (``None``).
    """
    if isinstance(operation, Update):
        initial, final = operation.initial, operation.final
        if set(initial.tokens).isdisjoint(final.tokens):
            return Cost(infinite=True)
        missingType = final.NEType is not None and initial.NEType is None
        return Cost(
            wrongType=not missingType and initial.NEType != final.NEType,
            missingType=missingType,
            tokensDiff=tokens.levenshtein.diff(
                initial.tokens,
                final.tokens
            )
        )
    if isinstance(operation, Edition):
        return Cost(missingOrExtra=1, tokensDiff=Diff.empty)
    return None


levenshtein = GLevenshtein(cost)
# NOTE(review): the string literal below is a bare expression statement: it is
# never assigned nor evaluated. It only holds disabled code (a `diff` method
# and a hand-written `Cost` class) and has no effect on the module.
"""
def diff(self, other):
missingType = self.eNEType is None or other.eNEType is None
return {
'missingType': missingType,
'wrongType': not missingType and self.eNEType != other.eNEType
# TODO: write a levenshtein for tokens with namesWeight and
# otherWeight
}
class Cost:
def __init__(self, impossible, wrongType, missingType, tokensCost):
self.impossible = impossible
self.wrongType = wrongType
self.missingType = missingType
self.tokenCosts = tokenCosts
def __add__(self, other):
return Cost(
self.impossible | other.impossible,
self.name + other.name,
self.minorWord + self.minorWord
)
def __lt__(self, other):
selfTriple = (self.impossible, self.name, self.minorWord)
otherTriple = (other.impossible, other.name, other.minorWord)
return selfTriple < otherTriple
def __le__(self, other):
selfTriple = (self.impossible, self.name, self.minorWord)
otherTriple = (other.impossible, other.name, other.minorWord)
return selfTriple <= otherTriple
def __gt__(self, other):
selfTriple = (self.impossible, self.name, self.minorWord)
otherTriple = (other.impossible, other.name, other.minorWord)
return selfTriple > otherTriple
def __ge__(self, other):
selfTriple = (self.impossible, self.name, self.minorWord)
otherTriple = (other.impossible, other.name, other.minorWord)
return selfTriple >= otherTriple
"""
from lxml import etree
from ne_score.namedEntity import NE
import ne_score.namedEntity as nE
import re
# Words are cut on hyphens, spaces and apostrophes.
tokenizer = re.compile("[- ']")


def getTokens(words):
    """Split every word into tokens and return them as one flat list.

    Each word is stripped of surrounding whitespace before being cut by
    ``tokenizer``; empty pieces are dropped.
    """
    tokens = []
    for word in words:
        for piece in tokenizer.split(word.strip()):
            if len(piece) > 0:
                tokens.append(piece)
    return tokens
class ENE:
    """An entity made of a type and of the tokens of the words it spans."""

    def __init__(self, eNEType, wElems):
        texts = [w.text for w in wElems]
        self.eNEType = eNEType
        self.tokens = getTokens(texts)

    def diff(self, other):
        """Compare the types of two entities.

        Returns a dict with ``missingType`` (at least one of the two has no
        type) and ``wrongType`` (both are typed, differently).
        """
        untyped = self.eNEType is None or other.eNEType is None
        if untyped:
            mismatch = False
        else:
            mismatch = self.eNEType != other.eNEType
        # TODO: write a levenshtein for tokens with namesWeight and
        # otherWeight
        return {'missingType': untyped, 'wrongType': mismatch}

    def __str__(self):
        joined = ','.join(self.tokens)
        return "|{tokens}|({eNEType})".format(
            tokens=joined,
            eNEType=self.eNEType
        )
def nEFromPerdidoElement(e):
    """Build the ``NE`` described by an element of a Perdido document.

    The ``type`` attribute of the element selects the entity type: ``place``
    gives ``Spatial``, ``date`` gives ``Date`` and ``person`` gives
    ``Person``; any other value gives an untyped entity. The words of the
    entity are all the ``w`` descendants of the element.

    Raises:
        KeyError: when the element has no ``type`` attribute.
    """
    ws = e.xpath('.//w')
    knownTypes = {'place': 'Spatial', 'date': 'Date', 'person': 'Person'}
    # The `type` attribute of the inner words is no longer taken into
    # account, so it is not collected any more.
    return NE(knownTypes.get(e.attrib['type']), ws)
# Matches "N", any single character, then "_" (as in "NP_").
posPrefix = re.compile("N._")


def nEFromTEIWAElement(e):
    """Build the ``NE`` described by an element carrying ``enc_tags``.

    The entity type is the ``enc_tags`` attribute with every match of
    ``posPrefix`` removed. A ``w`` element is its own single word; any other
    element contributes its direct ``w`` children.

    Raises:
        KeyError: when the element has no ``enc_tags`` attribute.
    """
    words = [e] if e.tag == 'w' else e.xpath('./w')
    return NE(posPrefix.sub("", e.attrib['enc_tags']), words)
def fromPerdido(dom):
    """Return a lazy iterator of the ``NE`` found in a Perdido document.

    Entities are read from the ``rs`` elements whose ``subtype`` attribute is
    ``no``.
    """
    return map(nEFromPerdidoElement, dom.xpath('//rs[@subtype="no"]'))
def fromTEIWA(dom):
    """Return a lazy iterator of the ``NE`` found in a TEIWA document.

    Entities are read from every element whose ``enc_tags`` attribute
    contains ``NP_``.
    """
    elements = dom.xpath('//*[contains(@enc_tags, "NP_")]')
    return map(nEFromTEIWAElement, elements)
def eNESequence(filePath):
def nESequence(filePath):
try:
with open(filePath) as f:
try:
......
from ne_score.generalLevenshtein import Edition, Update, GLevenshtein
from ne_score.cartesianCost import CartesianCost
import re
# Separators between tokens: hyphen, space and apostrophe.
tokenizer = re.compile("[- ']")


def get(words):
    """Return the flat list of non-empty tokens found in ``words``.

    Every word is stripped of surrounding whitespace, then cut by
    ``tokenizer``.
    """
    found = []
    for word in words:
        pieces = tokenizer.split(word.strip())
        found.extend(piece for piece in pieces if len(piece) > 0)
    return found
# Words listed here are compared with tokens, which never contain a hyphen, a
# space or an apostrophe since those are the separators used to cut them.
determiners = {
    "un", "une", "des",
    # "l" is what is left of "l'" once the apostrophe has been cut away.
    "le", "la", "l", "l'", "les",
}
prepositions = {
    "à", "de", "dans", "pour", "vers", "près", "en", "lès", "sur", "sous",
    "au", "aux", "du",
}
smallWords = determiners.union(prepositions)
class Cost(CartesianCost):
    """Cost of an edit between two lists of tokens."""

    @staticmethod
    def ranking():
        """Return the factor names, most significant first."""
        # Declared static: the function takes no `self`, so without the
        # decorator it could only be called through the class.
        return ['names', 'smallWords']
def cost(operation):
    """Rate one edit operation between two lists of tokens.

    - An ``Update`` (one token replaced by another) is infinitely expensive.
    - An ``Edition`` (insertion or deletion) counts one ``smallWords`` unit
      when its element is a small word, and one ``names`` unit otherwise.
    - Anything else has no cost (``None``).
    """
    if isinstance(operation, Update):
        return Cost(infinite=True)
    if isinstance(operation, Edition):
        if operation.element in smallWords:
            # The factor has to be one of those listed by Cost.ranking():
            # any other keyword is ignored and the edit would cost nothing.
            return Cost(smallWords=1)
        return Cost(names=1)
    return None


levenshtein = GLevenshtein(cost)
from distutils.core import setup
# Each keyword may be passed only once: the values of the renamed project
# (NE-score / ne_score / ne-score) are the ones kept.
# NOTE(review): `install_requires` is documented as a setuptools keyword;
# confirm that the `setup` imported by this file honours it.
setup(
    name="NE-score",
    version="0.1.0",
    description="yeah, maybe",
    author="Alice Brenon",
    author_email="alice.brenon@liris.cnrs.fr",
    url="https://gitlab.liris.cnrs.fr/abrenon/ne-score",
    packages=["ne_score"],
    install_requires=['lxml'],
    scripts=['ne-score'],
)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment