% ACL Anthology record 2024.wnut-1.8: workshop paper in the W-NUT 2024 proceedings.
% Accented letters use BibTeX special-character form, e.g. {\'u}, {\"u}, {\.G}.
@inproceedings{bhadauria-etal-2024-effects,
    title     = {The Effects of Data Quality on Named Entity Recognition},
    author    = {Bhadauria, Divya and Sierra M{\'u}nera, Alejandro and Krestel, Ralf},
    editor    = {van der Goot, Rob and
                 Bak, JinYeong and
                 M{\"u}ller-Eberstein, Max and
                 Xu, Wei and
                 Ritter, Alan and
                 Baldwin, Tim},
    booktitle = {Proceedings of the Ninth Workshop on Noisy and User-generated Text (W-NUT 2024)},
    month     = mar,
    year      = {2024},
    address   = {San {\.G}iljan, Malta},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2024.wnut-1.8},
    pages     = {79--88},
    abstract  = {The extraction of valuable information from the vast amount of digital data available today has become increasingly important, making Named Entity Recognition models an essential component of information extraction tasks. This emphasizes the importance of understanding the factors that can compromise the performance of these models. Many studies have examined the impact of data annotation errors on NER models, leaving the broader implication of overall data quality on these models unexplored. In this work, we evaluate the robustness of three prominent NER models on datasets with varying amounts of textual noise types. The results show that as the noise in the dataset increases, model performance declines, with a minor impact for some noise types and a significant drop in performance for others. The findings of this research can be used as a foundation for building robust NER systems by enhancing dataset quality beforehand.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey bhadauria-etal-2024-effects. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="bhadauria-etal-2024-effects">
    <titleInfo>
      <title>The Effects of Data Quality on Named Entity Recognition</title>
    </titleInfo>

    <!-- Authors of the paper, in the order listed in this record. -->
    <name type="personal">
      <namePart type="given">Divya</namePart>
      <namePart type="family">Bhadauria</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alejandro</namePart>
      <namePart type="family">Sierra Múnera</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ralf</namePart>
      <namePart type="family">Krestel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2024-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume containing the paper, with its editors. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Ninth Workshop on Noisy and User-generated Text (W-NUT 2024)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Rob</namePart>
        <namePart type="family">van der Goot</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">JinYeong</namePart>
        <namePart type="family">Bak</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Max</namePart>
        <namePart type="family">Müller-Eberstein</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Wei</namePart>
        <namePart type="family">Xu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tim</namePart>
        <namePart type="family">Baldwin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Ġiljan, Malta</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <abstract>The extraction of valuable information from the vast amount of digital data available today has become increasingly important, making Named Entity Recognition models an essential component of information extraction tasks. This emphasizes the importance of understanding the factors that can compromise the performance of these models. Many studies have examined the impact of data annotation errors on NER models, leaving the broader implication of overall data quality on these models unexplored. In this work, we evaluate the robustness of three prominent NER models on datasets with varying amounts of textual noise types. The results show that as the noise in the dataset increases, model performance declines, with a minor impact for some noise types and a significant drop in performance for others. The findings of this research can be used as a foundation for building robust NER systems by enhancing dataset quality beforehand.</abstract>

    <!-- Identifiers and location of the paper. -->
    <identifier type="citekey">bhadauria-etal-2024-effects</identifier>
    <location>
      <url>https://aclanthology.org/2024.wnut-1.8</url>
    </location>

    <!-- Issue date and page extent within the host item. -->
    <part>
      <date>2024-03</date>
      <extent unit="page">
        <start>79</start>
        <end>88</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T The Effects of Data Quality on Named Entity Recognition
%A Bhadauria, Divya
%A Sierra Múnera, Alejandro
%A Krestel, Ralf
%Y van der Goot, Rob
%Y Bak, JinYeong
%Y Müller-Eberstein, Max
%Y Xu, Wei
%Y Ritter, Alan
%Y Baldwin, Tim
%S Proceedings of the Ninth Workshop on Noisy and User-generated Text (W-NUT 2024)
%D 2024
%8 March
%I Association for Computational Linguistics
%C San Ġiljan, Malta
%F bhadauria-etal-2024-effects
%X The extraction of valuable information from the vast amount of digital data available today has become increasingly important, making Named Entity Recognition models an essential component of information extraction tasks. This emphasizes the importance of understanding the factors that can compromise the performance of these models. Many studies have examined the impact of data annotation errors on NER models, leaving the broader implication of overall data quality on these models unexplored. In this work, we evaluate the robustness of three prominent NER models on datasets with varying amounts of textual noise types. The results show that as the noise in the dataset increases, model performance declines, with a minor impact for some noise types and a significant drop in performance for others. The findings of this research can be used as a foundation for building robust NER systems by enhancing dataset quality beforehand.
%U https://aclanthology.org/2024.wnut-1.8
%P 79-88
Markdown (Informal)
[The Effects of Data Quality on Named Entity Recognition](https://aclanthology.org/2024.wnut-1.8) (Bhadauria et al., WNUT-WS 2024)
ACL
- Divya Bhadauria, Alejandro Sierra Múnera, and Ralf Krestel. 2024. The Effects of Data Quality on Named Entity Recognition. In Proceedings of the Ninth Workshop on Noisy and User-generated Text (W-NUT 2024), pages 79–88, San Ġiljan, Malta. Association for Computational Linguistics.