@inproceedings{pang-etal-2026-htmuon,
title = "{HTM}uon: Improving Muon via Heavy-Tailed Spectral Correction",
author = "Pang, Tianyu and
Fang, Yujie and
Liu, Zihang and
Deng, Shenyang and
Hsiung, Lei and
Yu, Shuhua and
Yang, Yaoqing",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1819/",
pages = "36504--36535",
ISBN = "979-8-89176-395-1",
abstract = "Muon has recently shown promising results in LLM training. In this work, we study how to further improve Muon. We argue that Muon{'}s orthogonalized update rule suppresses the emergence of heavy-tailed weight spectra and over-emphasizes the training along noise-dominated directions. Motivated by the Heavy-Tailed Self-Regularization (HT-SR) theory, we propose HTMuon. HTMuon preserves Muon{'}s ability to capture parameter interdependencies while producing heavier-tailed updates and inducing heavier-tailed weight spectra. Experiments on LLM pretraining and image classification show that HTMuon consistently improves performance over state-of-the-art baselines and can also serve as a plug-in on top of existing Muon variants. For example, on LLaMA pretraining on the C4 dataset, HTMuon reduces perplexity by up to 0.98 compared to Muon. We further theoretically show that HTMuon corresponds to steepest descent under the Schatten-$q$ norm constraint and provide convergence analysis in smooth non-convex settings. The implementation of HTMuon is available at \url{https://github.com/TDCSZ327/HTmuon}."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pang-etal-2026-htmuon">
<titleInfo>
<title>HTMuon: Improving Muon via Heavy-Tailed Spectral Correction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tianyu</namePart>
<namePart type="family">Pang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yujie</namePart>
<namePart type="family">Fang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zihang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shenyang</namePart>
<namePart type="family">Deng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lei</namePart>
<namePart type="family">Hsiung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuhua</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaoqing</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Muon has recently shown promising results in LLM training. In this work, we study how to further improve Muon. We argue that Muon’s orthogonalized update rule suppresses the emergence of heavy-tailed weight spectra and over-emphasizes the training along noise-dominated directions. Motivated by the Heavy-Tailed Self-Regularization (HT-SR) theory, we propose HTMuon. HTMuon preserves Muon’s ability to capture parameter interdependencies while producing heavier-tailed updates and inducing heavier-tailed weight spectra. Experiments on LLM pretraining and image classification show that HTMuon consistently improves performance over state-of-the-art baselines and can also serve as a plug-in on top of existing Muon variants. For example, on LLaMA pretraining on the C4 dataset, HTMuon reduces perplexity by up to 0.98 compared to Muon. We further theoretically show that HTMuon corresponds to steepest descent under the Schatten-q norm constraint and provide convergence analysis in smooth non-convex settings. The implementation of HTMuon is available at https://github.com/TDCSZ327/HTmuon.</abstract>
<identifier type="citekey">pang-etal-2026-htmuon</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1819/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>36504</start>
<end>36535</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HTMuon: Improving Muon via Heavy-Tailed Spectral Correction
%A Pang, Tianyu
%A Fang, Yujie
%A Liu, Zihang
%A Deng, Shenyang
%A Hsiung, Lei
%A Yu, Shuhua
%A Yang, Yaoqing
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F pang-etal-2026-htmuon
%X Muon has recently shown promising results in LLM training. In this work, we study how to further improve Muon. We argue that Muon’s orthogonalized update rule suppresses the emergence of heavy-tailed weight spectra and over-emphasizes the training along noise-dominated directions. Motivated by the Heavy-Tailed Self-Regularization (HT-SR) theory, we propose HTMuon. HTMuon preserves Muon’s ability to capture parameter interdependencies while producing heavier-tailed updates and inducing heavier-tailed weight spectra. Experiments on LLM pretraining and image classification show that HTMuon consistently improves performance over state-of-the-art baselines and can also serve as a plug-in on top of existing Muon variants. For example, on LLaMA pretraining on the C4 dataset, HTMuon reduces perplexity by up to 0.98 compared to Muon. We further theoretically show that HTMuon corresponds to steepest descent under the Schatten-q norm constraint and provide convergence analysis in smooth non-convex settings. The implementation of HTMuon is available at https://github.com/TDCSZ327/HTmuon.
%U https://aclanthology.org/2026.findings-acl.1819/
%P 36504-36535
Markdown (Informal)
[HTMuon: Improving Muon via Heavy-Tailed Spectral Correction](https://aclanthology.org/2026.findings-acl.1819/) (Pang et al., Findings 2026)
ACL
- Tianyu Pang, Yujie Fang, Zihang Liu, Shenyang Deng, Lei Hsiung, Shuhua Yu, and Yaoqing Yang. 2026. HTMuon: Improving Muon via Heavy-Tailed Spectral Correction. In Findings of the Association for Computational Linguistics: ACL 2026, pages 36504–36535, San Diego, California, United States. Association for Computational Linguistics.