@inproceedings{boglioni-etal-2026-generalisation,
title = "Do Generalisation Results Generalise?",
author = "Boglioni, Matteo and
Sgobbi, Andrea and
Tavernini, Gabriel and
Rita, Francesco and
Mosbach, Marius and
Pimentel, Tiago",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1284/",
pages = "25763--25783",
ISBN = "979-8-89176-395-1",
abstract = "A large language model{'}s (LLM{'}s) out-of-distribution (OOD) generalisation is crucial to its deployment. Previous work assessing LLMs' generalisation performance, however, typically focuses on a single out-of-distribution dataset. This approach may fail to precisely evaluate the capabilities of the model, as the data shifts encountered during deployment are much more diverse. In this work, we investigate whether OOD generalisation results generalise. More specifically, we evaluate a model{'}s performance across multiple OOD testsets throughout a finetuning run; we then evaluate the partial correlation of performances across these testsets, regressing out in-domain performance. This allows us to assess how correlated are generalisation performances once in-domain performance is controlled for. Analysing OLMo, OPT and SmolLM, we observe no overarching trend in generalisation results: the existence of a positive or negative correlation between any two OOD testsets depends strongly on the specific choice of model analysed."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="boglioni-etal-2026-generalisation">
<titleInfo>
<title>Do Generalisation Results Generalise?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Matteo</namePart>
<namePart type="family">Boglioni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Sgobbi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Tavernini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francesco</namePart>
<namePart type="family">Rita</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marius</namePart>
<namePart type="family">Mosbach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tiago</namePart>
<namePart type="family">Pimentel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>A large language model’s (LLM’s) out-of-distribution (OOD) generalisation is crucial to its deployment. Previous work assessing LLMs’ generalisation performance, however, typically focuses on a single out-of-distribution dataset. This approach may fail to precisely evaluate the capabilities of the model, as the data shifts encountered during deployment are much more diverse. In this work, we investigate whether OOD generalisation results generalise. More specifically, we evaluate a model’s performance across multiple OOD testsets throughout a finetuning run; we then evaluate the partial correlation of performances across these testsets, regressing out in-domain performance. This allows us to assess how correlated are generalisation performances once in-domain performance is controlled for. Analysing OLMo, OPT and SmolLM, we observe no overarching trend in generalisation results: the existence of a positive or negative correlation between any two OOD testsets depends strongly on the specific choice of model analysed.</abstract>
<identifier type="citekey">boglioni-etal-2026-generalisation</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1284/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>25763</start>
<end>25783</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do Generalisation Results Generalise?
%A Boglioni, Matteo
%A Sgobbi, Andrea
%A Tavernini, Gabriel
%A Rita, Francesco
%A Mosbach, Marius
%A Pimentel, Tiago
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F boglioni-etal-2026-generalisation
%X A large language model’s (LLM’s) out-of-distribution (OOD) generalisation is crucial to its deployment. Previous work assessing LLMs’ generalisation performance, however, typically focuses on a single out-of-distribution dataset. This approach may fail to precisely evaluate the capabilities of the model, as the data shifts encountered during deployment are much more diverse. In this work, we investigate whether OOD generalisation results generalise. More specifically, we evaluate a model’s performance across multiple OOD testsets throughout a finetuning run; we then evaluate the partial correlation of performances across these testsets, regressing out in-domain performance. This allows us to assess how correlated are generalisation performances once in-domain performance is controlled for. Analysing OLMo, OPT and SmolLM, we observe no overarching trend in generalisation results: the existence of a positive or negative correlation between any two OOD testsets depends strongly on the specific choice of model analysed.
%U https://aclanthology.org/2026.findings-acl.1284/
%P 25763-25783
Markdown (Informal)
[Do Generalisation Results Generalise?](https://aclanthology.org/2026.findings-acl.1284/) (Boglioni et al., Findings 2026)
ACL
- Matteo Boglioni, Andrea Sgobbi, Gabriel Tavernini, Francesco Rita, Marius Mosbach, and Tiago Pimentel. 2026. Do Generalisation Results Generalise?. In Findings of the Association for Computational Linguistics: ACL 2026, pages 25763–25783, San Diego, California, United States. Association for Computational Linguistics.