From cf7ab0cdafd91352458d483de8084e5e4e107444 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Sebe?= <clemence.sebe@universite-paris-saclay.fr> Date: Wed, 4 Jun 2025 09:52:24 +0000 Subject: [PATCH] Update README.md --- README.md | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 4501427..6a86839 100644 --- a/README.md +++ b/README.md @@ -5,14 +5,22 @@ This directory contains all the necessary information and scripts to reproduce the results presented in : ``` -@misc{sebe2024extractinginformationlowresourcesetting, - title={Extracting Information in a Low-resource Setting: Case Study on Bioinformatics Workflows}, - author={Clémence Sebe and Sarah Cohen-Boulakia and Olivier Ferret and Aurélie Névéol}, - year={2024}, - eprint={2411.19295}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2411.19295}, +@InProceedings{10.1007/978-3-031-91398-3_21, + author="Sebe, Cl{\'e}mence + and Cohen-Boulakia, Sarah + and Ferret, Olivier + and N{\'e}v{\'e}ol, Aur{\'e}lie", + editor="Krempl, Georg + and Puolam{\"a}ki, Kai + and Miliou, Ioanna", + title="Extracting Information in a Low-Resource Setting: Case Study on Bioinformatics Workflows", + booktitle="Advances in Intelligent Data Analysis XXIII", + year="2025", + publisher="Springer Nature Switzerland", + address="Cham", + pages="274--287", + abstract="Bioinformatics workflows are essential for complex biological data analyses and are often described in scientific articles with source code in public repositories. Extracting detailed workflow information from articles can improve accessibility and reusability but is hindered by limited annotated corpora. To address this, we framed the problem as a low-resource extraction task and tested four strategies: 1) creating a tailored annotated corpus, 2) few-shot named-entity recognition (NER) with an autoregressive language model, 3) NER using masked language models with existing and new corpora, and 4) integrating workflow knowledge into NER models. Using BioToFlow, a new corpus of 52 articles annotated with 16 entities, a SciBERT-based NER model achieved a 70.4 F-measure, comparable to inter-annotator agreement. While knowledge integration improved performance for specific entities, it was less effective across the entire information schema. Our results demonstrate that high-performance information extraction for bioinformatics workflows is achievable.", + isbn="978-3-031-91398-3" } ``` -- GitLab