BibTeX
@inproceedings{babl-etal-2025-random,
title = "Random Splitting Negatively Impacts {NER} Evaluation: Quantifying and Eliminating the Overestimation of {NER} Performance",
author = "Babl, Florian and
Hennen, Moritz and
Murauer, Jakob and
Geierhos, Michaela",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.504/",
doi = "10.18653/v1/2025.findings-acl.504",
pages = "9724--9738",
ISBN = "979-8-89176-256-5",
abstract = "In named entity recognition (NER), models are evaluated on their ability to identify entity mentions in text. However, standard evaluation methods often rely on test sets that contain named entities already present in the training data, raising concerns about overestimation of model performance.This work investigates the impact of varying degrees of entity contamination on a dataset level on the generalization ability and reported F1 scores of three state-of-the-art NER models.Experiments on five standard benchmarks show that F1 scores for contaminated entities statistically significantly inflate reported F1 scores as contamination rates increase, with F1 performance gaps ranging from 2-10{\%} compared to entities not seen during training.To address these inflated F1 scores, we additionally propose a novel NER dataset splitting method using a minimum cut algorithm to minimize train-test entity leakage.While our splitting method ensures near-zero entity contamination, we also compare new and existing dataset splits on named entity sample counts."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="babl-etal-2025-random">
<titleInfo>
<title>Random Splitting Negatively Impacts NER Evaluation: Quantifying and Eliminating the Overestimation of NER Performance</title>
</titleInfo>
<name type="personal">
<namePart type="given">Florian</namePart>
<namePart type="family">Babl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Moritz</namePart>
<namePart type="family">Hennen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jakob</namePart>
<namePart type="family">Murauer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michaela</namePart>
<namePart type="family">Geierhos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>In named entity recognition (NER), models are evaluated on their ability to identify entity mentions in text. However, standard evaluation methods often rely on test sets that contain named entities already present in the training data, raising concerns about overestimation of model performance.This work investigates the impact of varying degrees of entity contamination on a dataset level on the generalization ability and reported F1 scores of three state-of-the-art NER models.Experiments on five standard benchmarks show that F1 scores for contaminated entities statistically significantly inflate reported F1 scores as contamination rates increase, with F1 performance gaps ranging from 2-10% compared to entities not seen during training.To address these inflated F1 scores, we additionally propose a novel NER dataset splitting method using a minimum cut algorithm to minimize train-test entity leakage.While our splitting method ensures near-zero entity contamination, we also compare new and existing dataset splits on named entity sample counts.</abstract>
<identifier type="citekey">babl-etal-2025-random</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.504</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.504/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>9724</start>
<end>9738</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Random Splitting Negatively Impacts NER Evaluation: Quantifying and Eliminating the Overestimation of NER Performance
%A Babl, Florian
%A Hennen, Moritz
%A Murauer, Jakob
%A Geierhos, Michaela
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F babl-etal-2025-random
%X In named entity recognition (NER), models are evaluated on their ability to identify entity mentions in text. However, standard evaluation methods often rely on test sets that contain named entities already present in the training data, raising concerns about overestimation of model performance. This work investigates the impact of varying degrees of entity contamination at the dataset level on the generalization ability and reported F1 scores of three state-of-the-art NER models. Experiments on five standard benchmarks show that contaminated entities statistically significantly inflate reported F1 scores as contamination rates increase, with F1 performance gaps of 2-10% compared to entities not seen during training. To address these inflated F1 scores, we additionally propose a novel NER dataset splitting method using a minimum cut algorithm to minimize train-test entity leakage. While our splitting method ensures near-zero entity contamination, we also compare new and existing dataset splits on named entity sample counts.
%R 10.18653/v1/2025.findings-acl.504
%U https://aclanthology.org/2025.findings-acl.504/
%U https://doi.org/10.18653/v1/2025.findings-acl.504
%P 9724-9738
Markdown (Informal)
[Random Splitting Negatively Impacts NER Evaluation: Quantifying and Eliminating the Overestimation of NER Performance](https://aclanthology.org/2025.findings-acl.504/) (Babl et al., Findings 2025)
ACL
Florian Babl, Moritz Hennen, Jakob Murauer, and Michaela Geierhos. 2025. [Random Splitting Negatively Impacts NER Evaluation: Quantifying and Eliminating the Overestimation of NER Performance](https://aclanthology.org/2025.findings-acl.504/). In *Findings of the Association for Computational Linguistics: ACL 2025*, pages 9724–9738, Vienna, Austria. Association for Computational Linguistics.
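
The abstract's proposed split hinges on a minimum cut over entity overlap between samples. As a rough illustration of that idea only (not the authors' implementation), the sketch below builds a graph whose nodes are sentences and whose edge weights count shared entities, then takes a global minimum cut with networkx's Stoer-Wagner routine; the function name, the input format, and the choice of a global (rather than balanced or constrained) cut are all assumptions made for illustration.

```python
# Illustrative sketch of a min-cut-based train/test split for NER data.
# NOT the paper's algorithm: graph construction and the use of a global
# Stoer-Wagner cut are assumptions.
import networkx as nx

def entity_min_cut_split(entity_sets):
    """entity_sets[i]: set of entity surface forms annotated in sample i
    (assumed input format). Returns (side_a, side_b, leaked_entity_count)."""
    g = nx.Graph()
    g.add_nodes_from(range(len(entity_sets)))
    # Edge weight = number of entities two samples share; a cut separating
    # them therefore "pays" one unit per entity that would leak across splits.
    for i in range(len(entity_sets)):
        for j in range(i + 1, len(entity_sets)):
            shared = len(entity_sets[i] & entity_sets[j])
            if shared:
                g.add_edge(i, j, weight=shared)
    # Stoer-Wagner global minimum cut (requires a connected graph); the cut
    # value is the total train-test entity leakage of the induced split.
    leaked, (side_a, side_b) = nx.stoer_wagner(g)
    return sorted(side_a), sorted(side_b), leaked

# Toy example: a chain of entity overlaps; any single-edge cut leaks one entity.
samples = [{"Vienna", "ACL"}, {"Vienna", "NER"}, {"NER", "F1"}, {"F1", "CoNLL"}]
print(entity_min_cut_split(samples))  # one possible result: ([0], [1, 2, 3], 1)
```

Note that a plain global minimum cut does not control the sizes of the two sides; the paper's method presumably constrains the split toward usable train/test proportions, which this sketch omits.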