@inproceedings{amalvy-etal-2023-learning,
title = "Learning to Rank Context for Named Entity Recognition Using a Synthetic Dataset",
author = "Amalvy, Arthur and
Labatut, Vincent and
Dufour, Richard",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.642",
doi = "10.18653/v1/2023.emnlp-main.642",
pages = "10372--10382",
abstract = "While recent pre-trained transformer-based models can perform named entity recognition (NER) with great accuracy, their limited range remains an issue when applied to long documents such as whole novels. To alleviate this issue, a solution is to retrieve relevant context at the document level. Unfortunately, the lack of supervision for such a task means one has to settle for unsupervised approaches. Instead, we propose to generate a synthetic context retrieval training dataset using Alpaca, an instruction-tuned large language model (LLM). Using this dataset, we train a neural context retriever based on a BERT model that is able to find relevant context for NER. We show that our method outperforms several retrieval baselines for the NER task on an English literary dataset composed of the first chapter of 40 books.",
}
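The abstract above describes training a BERT-based neural context retriever on synthetic ranking data. As a minimal illustrative sketch only (not the authors' released code), the snippet below shows one common way such a retriever can be framed: a cross-encoder where BERT scores (sentence, candidate context) pairs and candidates are ranked by score. The model name, the single-logit head, and the `rank_contexts` helper are all assumptions for illustration.

```python
# Hypothetical sketch of a cross-encoder context ranker; the paper's actual
# architecture and training setup may differ.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# num_labels=1 yields a single relevance logit per pair (a common ranking head).
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=1
)
model.eval()

def rank_contexts(sentence: str, candidates: list[str]) -> list[tuple[str, float]]:
    """Score each candidate context against the target sentence, best first."""
    inputs = tokenizer(
        [sentence] * len(candidates),  # pair the sentence with every candidate
        candidates,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        scores = model(**inputs).logits.squeeze(-1)  # one score per pair
    return sorted(
        zip(candidates, scores.tolist()), key=lambda x: x[1], reverse=True
    )
```

With a fine-tuned checkpoint, the top-ranked candidates would be concatenated with the input sentence before running the NER model; with the untrained head shown here, scores are of course meaningless until trained on ranking data.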
Markdown (Informal)

[Learning to Rank Context for Named Entity Recognition Using a Synthetic Dataset](https://aclanthology.org/2023.emnlp-main.642) (Amalvy et al., EMNLP 2023)

ACL

Arthur Amalvy, Vincent Labatut, and Richard Dufour. 2023. Learning to Rank Context for Named Entity Recognition Using a Synthetic Dataset. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 10372–10382, Singapore. Association for Computational Linguistics.