@inproceedings{jiang-etal-2025-improved,
title = "Improved Sparse Upcycling for Instruction Tuning",
author = "Jiang, Wangyi and
Lu, Yaojie and
Lin, Hongyu and
Han, Xianpei and
Sun, Le",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.636/",
pages = "9485--9498",
abstract = "The Mixture-of-Experts (MoE) architecture has demonstrated significant potential in both large-scale pre-training and instruction tuning by offering increased parameter capacity without additional inference costs. However, developing MoE models faces challenges including training instability and the need for substantial high-quality training data. While efficient methodologies like sparse upcycling exist, they often lead to performance degradation in instruction tuning scenarios. We introduce representation-based sparse upcycling, a straightforward yet effective technique for converting dense language models into sparsely activated ones while maintaining similar computational costs. Unlike conventional sparse upcycling, our approach leverages intermediate representations from language models to initialize router weights. This strategy addresses the mismatch between randomly initialized and well-trained parameters while providing prior knowledge to guide expert specialization during training. Extensive experiments across diverse benchmarks demonstrate significant improvements in both model capabilities and routing consistency compared to existing approaches."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jiang-etal-2025-improved">
<titleInfo>
<title>Improved Sparse Upcycling for Instruction Tuning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wangyi</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaojie</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongyu</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xianpei</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Le</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The Mixture-of-Experts (MoE) architecture has demonstrated significant potential in both large-scale pre-training and instruction tuning by offering increased parameter capacity without additional inference costs. However, developing MoE models faces challenges including training instability and the need for substantial high-quality training data. While efficient methodologies like sparse upcycling exist, they often lead to performance degradation in instruction tuning scenarios. We introduce representation-based sparse upcycling, a straightforward yet effective technique for converting dense language models into sparsely activated ones while maintaining similar computational costs. Unlike conventional sparse upcycling, our approach leverages intermediate representations from language models to initialize router weights. This strategy addresses the mismatch between randomly initialized and well-trained parameters while providing prior knowledge to guide expert specialization during training. Extensive experiments across diverse benchmarks demonstrate significant improvements in both model capabilities and routing consistency compared to existing approaches.</abstract>
<identifier type="citekey">jiang-etal-2025-improved</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.636/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>9485</start>
<end>9498</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improved Sparse Upcycling for Instruction Tuning
%A Jiang, Wangyi
%A Lu, Yaojie
%A Lin, Hongyu
%A Han, Xianpei
%A Sun, Le
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F jiang-etal-2025-improved
%X The Mixture-of-Experts (MoE) architecture has demonstrated significant potential in both large-scale pre-training and instruction tuning by offering increased parameter capacity without additional inference costs. However, developing MoE models faces challenges including training instability and the need for substantial high-quality training data. While efficient methodologies like sparse upcycling exist, they often lead to performance degradation in instruction tuning scenarios. We introduce representation-based sparse upcycling, a straightforward yet effective technique for converting dense language models into sparsely activated ones while maintaining similar computational costs. Unlike conventional sparse upcycling, our approach leverages intermediate representations from language models to initialize router weights. This strategy addresses the mismatch between randomly initialized and well-trained parameters while providing prior knowledge to guide expert specialization during training. Extensive experiments across diverse benchmarks demonstrate significant improvements in both model capabilities and routing consistency compared to existing approaches.
%U https://aclanthology.org/2025.coling-main.636/
%P 9485-9498
Markdown (Informal)
[Improved Sparse Upcycling for Instruction Tuning](https://aclanthology.org/2025.coling-main.636/) (Jiang et al., COLING 2025)
ACL
- Wangyi Jiang, Yaojie Lu, Hongyu Lin, Xianpei Han, and Le Sun. 2025. Improved Sparse Upcycling for Instruction Tuning. In Proceedings of the 31st International Conference on Computational Linguistics, pages 9485–9498, Abu Dhabi, UAE. Association for Computational Linguistics.
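
The abstract describes initializing the MoE routers from the dense model's intermediate representations rather than at random. The paper defines the exact procedure; purely as an illustrative sketch, one way such an initialization could look is to collect hidden states from the dense model, cluster them into one prototype per expert, and copy the prototypes into the router weight matrix. Everything below (the function name, the k-means choice, the PyTorch framing) is an assumption for illustration, not the authors' implementation.

```python
# Hedged sketch: one possible "representation-based" router initialization.
# Hidden states gathered from the dense model are clustered; the centroids
# become the rows of the router, so tokens are routed toward the expert
# whose prototype their representation most resembles.
import torch
import torch.nn as nn


def init_router_from_representations(hidden_states: torch.Tensor,
                                     num_experts: int,
                                     num_iters: int = 10) -> nn.Linear:
    """hidden_states: (num_tokens, d_model) intermediate activations from the dense model."""
    num_tokens, d_model = hidden_states.shape

    # Simple k-means over the hidden states; centroids act as expert prototypes.
    idx = torch.randperm(num_tokens)[:num_experts]
    centroids = hidden_states[idx].clone()
    for _ in range(num_iters):
        assign = torch.cdist(hidden_states, centroids).argmin(dim=-1)
        for e in range(num_experts):
            mask = assign == e
            if mask.any():
                centroids[e] = hidden_states[mask].mean(dim=0)

    # Router logits are h @ W.T, so each row of W is one expert's prototype.
    router = nn.Linear(d_model, num_experts, bias=False)
    with torch.no_grad():
        router.weight.copy_(centroids)  # (num_experts, d_model)
    return router


# Illustrative usage with synthetic activations:
# hidden = torch.randn(10_000, 4096)
# router = init_router_from_representations(hidden, num_experts=8)
```

A router built this way scores each token's hidden state against the expert prototypes, which is one reading of how intermediate representations could provide "prior knowledge to guide expert specialization" while avoiding the mismatch between random router weights and the well-trained expert parameters.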