@inproceedings{benito-santos-etal-2025-robust,
title = "Robust Estimation of Population-Level Effects in Repeated-Measures {NLP} Experimental Designs",
author = "Benito-Santos, Alejandro and
Ghajari, Adrian and
Fresno, V{\'i}ctor",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1586/",
doi = "10.18653/v1/2025.acl-long.1586",
pages = "33076--33089",
ISBN = "979-8-89176-251-0",
abstract = "NLP research frequently grapples with multiple sources of variability{---}spanning runs, datasets, annotators, and more{---}yet conventional analysis methods often neglect these hierarchical structures, threatening the reproducibility of findings. To address this gap, we contribute a case study illustrating how linear mixed-effects models (LMMs) can rigorously capture systematic language-dependent differences (i.e., population-level effects) in a population of monolingual and multilingual language models. In the context of a bilingual hate speech detection task, we demonstrate that LMMs can uncover significant population-level effects{---}even under low-resource (small-N) experimental designs{---}while mitigating confounds and random noise. By setting out a transparent blueprint for repeated-measures experimentation, we encourage the NLP community to embrace variability as a feature, rather than a nuisance, in order to advance more robust, reproducible, and ultimately trustworthy results."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="benito-santos-etal-2025-robust">
<titleInfo>
<title>Robust Estimation of Population-Level Effects in Repeated-Measures NLP Experimental Designs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alejandro</namePart>
<namePart type="family">Benito-Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adrian</namePart>
<namePart type="family">Ghajari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Víctor</namePart>
<namePart type="family">Fresno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>NLP research frequently grapples with multiple sources of variability—spanning runs, datasets, annotators, and more—yet conventional analysis methods often neglect these hierarchical structures, threatening the reproducibility of findings. To address this gap, we contribute a case study illustrating how linear mixed-effects models (LMMs) can rigorously capture systematic language-dependent differences (i.e., population-level effects) in a population of monolingual and multilingual language models. In the context of a bilingual hate speech detection task, we demonstrate that LMMs can uncover significant population-level effects—even under low-resource (small-N) experimental designs—while mitigating confounds and random noise. By setting out a transparent blueprint for repeated-measures experimentation, we encourage the NLP community to embrace variability as a feature, rather than a nuisance, in order to advance more robust, reproducible, and ultimately trustworthy results.</abstract>
<identifier type="citekey">benito-santos-etal-2025-robust</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1586</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1586/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>33076</start>
<end>33089</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Robust Estimation of Population-Level Effects in Repeated-Measures NLP Experimental Designs
%A Benito-Santos, Alejandro
%A Ghajari, Adrian
%A Fresno, Víctor
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F benito-santos-etal-2025-robust
%X NLP research frequently grapples with multiple sources of variability—spanning runs, datasets, annotators, and more—yet conventional analysis methods often neglect these hierarchical structures, threatening the reproducibility of findings. To address this gap, we contribute a case study illustrating how linear mixed-effects models (LMMs) can rigorously capture systematic language-dependent differences (i.e., population-level effects) in a population of monolingual and multilingual language models. In the context of a bilingual hate speech detection task, we demonstrate that LMMs can uncover significant population-level effects—even under low-resource (small-N) experimental designs—while mitigating confounds and random noise. By setting out a transparent blueprint for repeated-measures experimentation, we encourage the NLP community to embrace variability as a feature, rather than a nuisance, in order to advance more robust, reproducible, and ultimately trustworthy results.
%R 10.18653/v1/2025.acl-long.1586
%U https://aclanthology.org/2025.acl-long.1586/
%U https://doi.org/10.18653/v1/2025.acl-long.1586
%P 33076-33089
Markdown (Informal)
[Robust Estimation of Population-Level Effects in Repeated-Measures NLP Experimental Designs](https://aclanthology.org/2025.acl-long.1586/) (Benito-Santos et al., ACL 2025)
ACL