@inproceedings{gajo-barron-cedeno-2024-cross,
title = "On Cross-Language Entity Label Projection and Recognition",
author = "Gajo, Paolo and
Barr{\'o}n-Cede{\~n}o, Alberto",
editor = "Dell'Orletta, Felice and
Lenci, Alessandro and
Montemagni, Simonetta and
Sprugnoli, Rachele",
booktitle = "Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)",
month = dec,
year = "2024",
address = "Pisa, Italy",
publisher = "CEUR Workshop Proceedings",
url = "https://aclanthology.org/2024.clicit-1.47/",
pages = "390--402",
ISBN = "979-12-210-7060-6",
abstract = "Most work on named entity recognition (NER) focuses solely on English. Through the use of training data augmentation via machine translation (MT), multilingual NER can become a powerful tool for information extraction in multilingual contexts. In this paper, we augment NER data from culinary recipe ingredient lists, by means of MT and word alignment (WA), following two approaches: (i) translating each entity separately, while taking into account the full context of the list and (ii) translating the whole list of ingredients and then aligning entities using three types of WA models: Giza++, Fast Align, and BERT, fine-tuned using a novel entity-shuffling approach. We depart from English data and produce Italian versions via MT, span-annotated with the entities projected from English. Then, we use the data produced by the two approaches to train mono- and multilingual NER BERT models. We test the performance of the WA and NER models on an annotated dataset of ingredient lists, partially out-of-domain compared to the training data. The results show that shuffling entities leads to better BERT aligner models. The higher quality NER data created by these models enables NER models to achieve better results, with multilingual models reaching performances equal to or greater than their monolingual counterparts."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gajo-barron-cedeno-2024-cross">
<titleInfo>
<title>On Cross-Language Entity Label Projection and Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Paolo</namePart>
<namePart type="family">Gajo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alberto</namePart>
<namePart type="family">Barrón-Cedeño</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felice</namePart>
<namePart type="family">Dell’Orletta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simonetta</namePart>
<namePart type="family">Montemagni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachele</namePart>
<namePart type="family">Sprugnoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>CEUR Workshop Proceedings</publisher>
<place>
<placeTerm type="text">Pisa, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-12-210-7060-6</identifier>
</relatedItem>
<abstract>Most work on named entity recognition (NER) focuses solely on English. Through the use of training data augmentation via machine translation (MT), multilingual NER can become a powerful tool for information extraction in multilingual contexts. In this paper, we augment NER data from culinary recipe ingredient lists, by means of MT and word alignment (WA), following two approaches: (i) translating each entity separately, while taking into account the full context of the list and (ii) translating the whole list of ingredients and then aligning entities using three types of WA models: Giza++, Fast Align, and BERT, fine-tuned using a novel entity-shuffling approach. We depart from English data and produce Italian versions via MT, span-annotated with the entities projected from English. Then, we use the data produced by the two approaches to train mono- and multilingual NER BERT models. We test the performance of the WA and NER models on an annotated dataset of ingredient lists, partially out-of-domain compared to the training data. The results show that shuffling entities leads to better BERT aligner models. The higher quality NER data created by these models enables NER models to achieve better results, with multilingual models reaching performances equal to or greater than their monolingual counterparts.</abstract>
<identifier type="citekey">gajo-barron-cedeno-2024-cross</identifier>
<location>
<url>https://aclanthology.org/2024.clicit-1.47/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>390</start>
<end>402</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On Cross-Language Entity Label Projection and Recognition
%A Gajo, Paolo
%A Barrón-Cedeño, Alberto
%Y Dell’Orletta, Felice
%Y Lenci, Alessandro
%Y Montemagni, Simonetta
%Y Sprugnoli, Rachele
%S Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)
%D 2024
%8 December
%I CEUR Workshop Proceedings
%C Pisa, Italy
%@ 979-12-210-7060-6
%F gajo-barron-cedeno-2024-cross
%X Most work on named entity recognition (NER) focuses solely on English. Through the use of training data augmentation via machine translation (MT), multilingual NER can become a powerful tool for information extraction in multilingual contexts. In this paper, we augment NER data from culinary recipe ingredient lists, by means of MT and word alignment (WA), following two approaches: (i) translating each entity separately, while taking into account the full context of the list and (ii) translating the whole list of ingredients and then aligning entities using three types of WA models: Giza++, Fast Align, and BERT, fine-tuned using a novel entity-shuffling approach. We depart from English data and produce Italian versions via MT, span-annotated with the entities projected from English. Then, we use the data produced by the two approaches to train mono- and multilingual NER BERT models. We test the performance of the WA and NER models on an annotated dataset of ingredient lists, partially out-of-domain compared to the training data. The results show that shuffling entities leads to better BERT aligner models. The higher quality NER data created by these models enables NER models to achieve better results, with multilingual models reaching performances equal to or greater than their monolingual counterparts.
%U https://aclanthology.org/2024.clicit-1.47/
%P 390-402
Markdown (Informal)
[On Cross-Language Entity Label Projection and Recognition](https://aclanthology.org/2024.clicit-1.47/) (Gajo & Barrón-Cedeño, CLiC-it 2024)
ACL