% ACL Anthology BibTeX record for this workshop paper (see the url field).
@inproceedings{aycock-etal-2026-limits,
  title     = {On the Limits of Model Merging for Multilinguality in Pre-Training},
  author    = {Aycock, Seth and
               Vitiugin, Fedor and
               Umnov, Aleksandr and
               Monz, Christof and
               Sima{'}an, Khalil},
  editor    = {Huang, Kaiyu and
               Mo, Fengran and
               Chen, Pinzhen and
               Jiang, Meng},
  booktitle = {Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models ({M}e{LLM} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.mellm-1.15/},
  pages     = {159--169},
  isbn      = {979-8-89176-430-9},
  abstract  = {Endowing models with consistent multilingual performance can be achieved by {\_}mixing{\_} pre-training data, or post-training approaches such as language-specific model {\_}merging{\_}. In this work, we test whether merging can be applied to monolingually pre-trained models. We conduct a controlled study on the efficacy of mixed, merged, and monolingual pre-training setups. We find that while monolingual pre-training results in strong in-language performance, merging any combination of monolingual models leads to performance collapse due to interference. Our analysis suggests representational similarity is a prerequisite for model merging. We therefore conclude that the flexibility of merging in fine-tuning does not extend trivially to language-specific pre-training.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper; the ID matches the citekey identifier below. -->
  <mods ID="aycock-etal-2026-limits">
    <titleInfo>
      <title>On the Limits of Model Merging for Multilinguality in Pre-Training</title>
    </titleInfo>
    <!-- Paper authors -->
    <name type="personal">
      <namePart type="given">Seth</namePart>
      <namePart type="family">Aycock</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fedor</namePart>
      <namePart type="family">Vitiugin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aleksandr</namePart>
      <namePart type="family">Umnov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Christof</namePart>
      <namePart type="family">Monz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Khalil</namePart>
      <namePart type="family">Sima’an</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models (MeLLM 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Kaiyu</namePart>
        <namePart type="family">Huang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fengran</namePart>
        <namePart type="family">Mo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pinzhen</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Meng</namePart>
        <namePart type="family">Jiang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-430-9</identifier>
    </relatedItem>
    <abstract>Endowing models with consistent multilingual performance can be achieved by _mixing_ pre-training data, or post-training approaches such as language-specific model _merging_. In this work, we test whether merging can be applied to monolingually pre-trained models. We conduct a controlled study on the efficacy of mixed, merged, and monolingual pre-training setups. We find that while monolingual pre-training results in strong in-language performance, merging any combination of monolingual models leads to performance collapse due to interference. Our analysis suggests representational similarity is a prerequisite for model merging. We therefore conclude that the flexibility of merging in fine-tuning does not extend trivially to language-specific pre-training.</abstract>
    <identifier type="citekey">aycock-etal-2026-limits</identifier>
    <location>
      <url>https://aclanthology.org/2026.mellm-1.15/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>159</start>
        <end>169</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T On the Limits of Model Merging for Multilinguality in Pre-Training
%A Aycock, Seth
%A Vitiugin, Fedor
%A Umnov, Aleksandr
%A Monz, Christof
%A Sima’an, Khalil
%Y Huang, Kaiyu
%Y Mo, Fengran
%Y Chen, Pinzhen
%Y Jiang, Meng
%S Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models (MeLLM 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, United States
%@ 979-8-89176-430-9
%F aycock-etal-2026-limits
%X Endowing models with consistent multilingual performance can be achieved by _mixing_ pre-training data, or post-training approaches such as language-specific model _merging_. In this work, we test whether merging can be applied to monolingually pre-trained models. We conduct a controlled study on the efficacy of mixed, merged, and monolingual pre-training setups. We find that while monolingual pre-training results in strong in-language performance, merging any combination of monolingual models leads to performance collapse due to interference. Our analysis suggests representational similarity is a prerequisite for model merging. We therefore conclude that the flexibility of merging in fine-tuning does not extend trivially to language-specific pre-training.
%U https://aclanthology.org/2026.mellm-1.15/
%P 159-169
Markdown (Informal)
[On the Limits of Model Merging for Multilinguality in Pre-Training](https://aclanthology.org/2026.mellm-1.15/) (Aycock et al., MeLLM 2026)
ACL
- Seth Aycock, Fedor Vitiugin, Aleksandr Umnov, Christof Monz, and Khalil Sima’an. 2026. On the Limits of Model Merging for Multilinguality in Pre-Training. In Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models (MeLLM 2026), pages 159–169, San Diego, United States. Association for Computational Linguistics.