@inproceedings{sanchez-carmona-etal-2025-towards,
title = "Towards Robust Comparisons of {NLP} Models: A Case Study",
author = "Sanchez Carmona, Vicente Ivan and
Jiang, Shanshan and
Dong, Bin",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.332/",
pages = "4973--4979",
abstract = "Comparing the test scores of different NLP models across downstream datasets to determine which model leads to the most accurate results is the ultimate step in any experimental work. Doing so via a single mean score may not accurately quantify the real capabilities of the models. Previous works have proposed diverse statistical tests to improve the comparison of NLP models; however, a key statistical phenomenon remains understudied: variability in test scores. We propose a type of regression analysis which better explains this phenomenon by isolating the effect of both nuisance factors (such as random seeds) and datasets from the effects of the models' capabilities. We showcase our approach via a case study of some of the most popular biomedical NLP models: after isolating nuisance factors and datasets, our results show that the difference between BioLinkBERT and MSR BiomedBERT is, actually, 7 times smaller than previously reported."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sanchez-carmona-etal-2025-towards">
<titleInfo>
<title>Towards Robust Comparisons of NLP Models: A Case Study</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vicente</namePart>
<namePart type="given">Ivan</namePart>
<namePart type="family">Sanchez Carmona</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shanshan</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Comparing the test scores of different NLP models across downstream datasets to determine which model leads to the most accurate results is the ultimate step in any experimental work. Doing so via a single mean score may not accurately quantify the real capabilities of the models. Previous works have proposed diverse statistical tests to improve the comparison of NLP models; however, a key statistical phenomenon remains understudied: variability in test scores. We propose a type of regression analysis which better explains this phenomenon by isolating the effect of both nuisance factors (such as random seeds) and datasets from the effects of the models’ capabilities. We showcase our approach via a case study of some of the most popular biomedical NLP models: after isolating nuisance factors and datasets, our results show that the difference between BioLinkBERT and MSR BiomedBERT is, actually, 7 times smaller than previously reported.</abstract>
<identifier type="citekey">sanchez-carmona-etal-2025-towards</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.332/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>4973</start>
<end>4979</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Robust Comparisons of NLP Models: A Case Study
%A Sanchez Carmona, Vicente Ivan
%A Jiang, Shanshan
%A Dong, Bin
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F sanchez-carmona-etal-2025-towards
%X Comparing the test scores of different NLP models across downstream datasets to determine which model leads to the most accurate results is the ultimate step in any experimental work. Doing so via a single mean score may not accurately quantify the real capabilities of the models. Previous works have proposed diverse statistical tests to improve the comparison of NLP models; however, a key statistical phenomenon remains understudied: variability in test scores. We propose a type of regression analysis which better explains this phenomenon by isolating the effect of both nuisance factors (such as random seeds) and datasets from the effects of the models’ capabilities. We showcase our approach via a case study of some of the most popular biomedical NLP models: after isolating nuisance factors and datasets, our results show that the difference between BioLinkBERT and MSR BiomedBERT is, actually, 7 times smaller than previously reported.
%U https://aclanthology.org/2025.coling-main.332/
%P 4973-4979
Markdown (Informal)
[Towards Robust Comparisons of NLP Models: A Case Study](https://aclanthology.org/2025.coling-main.332/) (Sanchez Carmona et al., COLING 2025)
ACL
- Vicente Ivan Sanchez Carmona, Shanshan Jiang, and Bin Dong. 2025. Towards Robust Comparisons of NLP Models: A Case Study. In Proceedings of the 31st International Conference on Computational Linguistics, pages 4973–4979, Abu Dhabi, UAE. Association for Computational Linguistics.