Skip to content
Snippets Groups Projects
Commit a8ce8b82 authored by Alice Brenon
Browse files

Reimplement CoNLL output to expose named entities in column MISC

parent 210311e5
No related branches found
No related tags found
No related merge requests found
...@@ -10,6 +10,37 @@ from tqdm import tqdm ...@@ -10,6 +10,37 @@ from tqdm import tqdm
def oneLine(sentence):
    """Return the text of `sentence` with newlines and tabs turned into spaces.

    Args:
        sentence: object exposing its raw text as a `text` attribute.

    Returns:
        The text where every newline and every tab character has been
        replaced by a single space, so that it fits on one line.
    """
    flattened = sentence.text
    for separator in ('\n', '\t'):
        flattened = flattened.replace(separator, ' ')
    return flattened
def coNLLGet(token, attribute):
    """Return the text to print in one column for `attribute` of a token.

    Args:
        token: mapping holding the annotations of a token.
        attribute: key to look up in `token`.

    Returns:
        The value converted with `str`, or the placeholder '_' when the key
        is absent or when its value is None or an empty string.
    """
    if attribute not in token:
        return '_'
    value = token[attribute]
    # Compare explicitly instead of relying on truthiness: 0 is a legitimate
    # value and must be rendered as '0', not replaced by the placeholder.
    if value is None or value == '':
        return '_'
    return str(value)
def singleOrRange(token_id):
    """Render a token identifier as text.

    Args:
        token_id: either an integer, or an iterable of values.

    Returns:
        `str(token_id)` for an integer (including instances of int
        subclasses); otherwise the string forms of the components joined
        with '-' (e.g. (1, 2) gives '1-2').
    """
    # isinstance rather than an exact type comparison, so that subclasses of
    # int are still treated as a single identifier instead of being iterated.
    if isinstance(token_id, int):
        return str(token_id)
    return '-'.join(map(str, token_id))
def miscAttribute(token):
    """Build a function rendering one attribute of `token` as "key=value".

    Args:
        token: mapping holding the annotations of a token.

    Returns:
        A function taking a key and returning a one-element list
        ["key=value"] when `token` holds a value other than None under that
        key, and an empty list otherwise. Returning lists lets callers
        concatenate the results of several keys.
    """
    def render(key):
        # A key stored with None carries no information; skip it rather than
        # emitting the literal text "key=None".
        if key not in token or token[key] is None:
            return []
        return [f"{key}={token[key]}"]
    return render
def misc(token):
    """Render the last column of a token line from selected attributes.

    The attributes 'start_char', 'end_char' and 'ner' are looked up in that
    order; each one found is rendered as "key=value".

    Args:
        token: mapping holding the annotations of a token.

    Returns:
        The rendered attributes joined with '|', or the placeholder '_' when
        none of them is available.
    """
    keys = ('start_char', 'end_char', 'ner')
    # Build the list in a single pass; keys that are missing or stored with
    # None are skipped so that "key=None" never reaches the output.
    attributes = [
        f"{key}={token[key]}"
        for key in keys
        if key in token and token[key] is not None
    ]
    return '|'.join(attributes) if attributes else '_'
def formatToken(token):
    """Format a token dictionary as one line of ten tab-separated columns.

    The first column is the identifier rendered by `singleOrRange`, the next
    eight are looked up with `coNLLGet` (which supplies the placeholder for
    missing values), and the last one is produced by `misc`.

    Args:
        token: mapping holding the annotations of a token; it must contain
            the key 'id'.

    Returns:
        The columns joined with tab characters, without a trailing newline.
    """
    fields = ("text", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps")
    columns = [singleOrRange(token['id'])]
    for field in fields:
        columns.append(coNLLGet(token, field))
    columns.append(misc(token))
    return '\t'.join(columns)
class Annotator: class Annotator:
def __init__(self, source, target): def __init__(self, source, target):
self.source = source self.source = source
...@@ -57,16 +88,13 @@ class Annotator: ...@@ -57,16 +88,13 @@ class Annotator:
if len(parsed.sentences) > 0: if len(parsed.sentences) > 0:
print(f'# newpar id = {self.newpar()}', file=target) print(f'# newpar id = {self.newpar()}', file=target)
for sentence in parsed.sentences: for sentence in parsed.sentences:
sentence.add_comment(f'# sent_id = {self.newsent()}') print(self.annotate_sentence(sentence), file=target)
sentence.add_comment(f'# text = {oneLine(sentence)}') def annotate_sentence(self, sentence):
print(CoNLL.doc2conll_text(parsed), file=target, end='') return "\n".join(
[f'# sent_id = {self.newsent()}',
# def annotate_paragraph(self, paragraph, target): f'# text = {oneLine(sentence)}'] +
# parsed = self.model(paragraph) list(map(formatToken, sentence.to_dict())) +
# for sentence in parsed.sentences: [''])
# sentence.add_comment(f'# sent_id = {self.newsent()}')
# sentence.add_comment(f'# text = {oneLine(sentence)}')
# print(CoNLL.doc2conll_text(parsed), file=target, end='')
def newpar(self): def newpar(self):
self.paragraphSerial += 1 self.paragraphSerial += 1
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment