@comment{NOTE(review): the eighth author, "Wxuefei", is not in "Family, Given" form, so BibTeX
  treats the whole token as a surname. It looks like a handle rather than a full name;
  confirm against the paper and correct it if so.}
@inproceedings{du-etal-2026-half,
  title     = {{Half-S}: Halving the Scale for Near-Lossless 4-Bit {LLM} Training},
  author    = {Du, Jinyang and
               Gong, Ruihao and
               Ai, Linghan and
               Wang, Zining and
               Peng, Yunke and
               Wang, Yao and
               Yan, Lei and
               Wxuefei and
               Wang, Yaoyuan and
               Guo, Jinyang and
               Lin, Dahua and
               Liu, Xianglong},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.241/},
  pages     = {4890--4903},
  isbn      = {979-8-89176-395-1},
  abstract  = {Training large language models (LLMs) at 4-bit precision offers substantial efficiency gains but remains challenging due to the limited dynamic range and coarse numerical resolution. Existing 4-bit training pipelines typically rely on max-scaling, which is ill-suited for heavy-tailed LLM tensor distributions and leads to severe under-utilization of the FP4 quantization grid in the low-magnitude region. This effect causes pronounced \textit{representation collapse} and large rounding errors for the values that dominate LLM computation. In this work, we derive the theoretically optimal scaling for FP4 under heavy-tailed inputs, revealing why max-scaling is intrinsically suboptimal. Guided by this analysis, we propose \textbf{Half-S}, a simple and efficient scaling strategy that uses half-scaling as a hardware-friendly default and falls back to an MSE-based clipping threshold when needed, yielding a close approximation to the theoretical optimum under real LLM statistics. Extensive experiments on large-scale pretraining and downstream fine-tuning show that Half-S consistently narrows the gap to BF16 in both convergence and final model quality, while preserving the efficiency benefits of 4-bit computation. Under native FP4 support, Half-S is estimated to provide up to \textbf{1.8$\times$} end-to-end training speedup. These results indicate that Half-S provides a simple and effective correction to max-scaling, substantially improving the stability and accuracy of 4-bit LLM training.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper with citation key du-etal-2026-half. -->
  <mods ID="du-etal-2026-half">
    <titleInfo>
      <title>Half-S: Halving the Scale for Near-Lossless 4-Bit LLM Training</title>
    </titleInfo>
    <!-- Authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Jinyang</namePart>
      <namePart type="family">Du</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ruihao</namePart>
      <namePart type="family">Gong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Linghan</namePart>
      <namePart type="family">Ai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zining</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yunke</namePart>
      <namePart type="family">Peng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yao</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lei</namePart>
      <namePart type="family">Yan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <!-- NOTE(review): this author is recorded as a single untyped name part ("Wxuefei"),
         unlike the given/family pairs used for every other author. It looks like a handle
         rather than a full name; confirm against the paper. -->
    <name>
      <namePart>Wxuefei</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yaoyuan</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jinyang</namePart>
      <namePart type="family">Guo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dahua</namePart>
      <namePart type="family">Lin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xianglong</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <!-- Plain-text abstract: the multiplication sign is a literal character, not TeX markup. -->
    <abstract>Training large language models (LLMs) at 4-bit precision offers substantial efficiency gains but remains challenging due to the limited dynamic range and coarse numerical resolution. Existing 4-bit training pipelines typically rely on max-scaling, which is ill-suited for heavy-tailed LLM tensor distributions and leads to severe under-utilization of the FP4 quantization grid in the low-magnitude region. This effect causes pronounced representation collapse and large rounding errors for the values that dominate LLM computation. In this work, we derive the theoretically optimal scaling for FP4 under heavy-tailed inputs, revealing why max-scaling is intrinsically suboptimal. Guided by this analysis, we propose Half-S, a simple and efficient scaling strategy that uses half-scaling as a hardware-friendly default and falls back to an MSE-based clipping threshold when needed, yielding a close approximation to the theoretical optimum under real LLM statistics. Extensive experiments on large-scale pretraining and downstream fine-tuning show that Half-S consistently narrows the gap to BF16 in both convergence and final model quality, while preserving the efficiency benefits of 4-bit computation. Under native FP4 support, Half-S is estimated to provide up to 1.8× end-to-end training speedup. These results indicate that Half-S provides a simple and effective correction to max-scaling, substantially improving the stability and accuracy of 4-bit LLM training.</abstract>
    <identifier type="citekey">du-etal-2026-half</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.241/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>4890</start>
        <end>4903</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Half-S: Halving the Scale for Near-Lossless 4-Bit LLM Training
%A Du, Jinyang
%A Gong, Ruihao
%A Ai, Linghan
%A Wang, Zining
%A Peng, Yunke
%A Wang, Yao
%A Yan, Lei
%A Wxuefei
%A Wang, Yaoyuan
%A Guo, Jinyang
%A Lin, Dahua
%A Liu, Xianglong
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F du-etal-2026-half
%X Training large language models (LLMs) at 4-bit precision offers substantial efficiency gains but remains challenging due to the limited dynamic range and coarse numerical resolution. Existing 4-bit training pipelines typically rely on max-scaling, which is ill-suited for heavy-tailed LLM tensor distributions and leads to severe under-utilization of the FP4 quantization grid in the low-magnitude region. This effect causes pronounced representation collapse and large rounding errors for the values that dominate LLM computation. In this work, we derive the theoretically optimal scaling for FP4 under heavy-tailed inputs, revealing why max-scaling is intrinsically suboptimal. Guided by this analysis, we propose Half-S, a simple and efficient scaling strategy that uses half-scaling as a hardware-friendly default and falls back to an MSE-based clipping threshold when needed, yielding a close approximation to the theoretical optimum under real LLM statistics. Extensive experiments on large-scale pretraining and downstream fine-tuning show that Half-S consistently narrows the gap to BF16 in both convergence and final model quality, while preserving the efficiency benefits of 4-bit computation. Under native FP4 support, Half-S is estimated to provide up to 1.8× end-to-end training speedup. These results indicate that Half-S provides a simple and effective correction to max-scaling, substantially improving the stability and accuracy of 4-bit LLM training.
%U https://aclanthology.org/2026.findings-acl.241/
%P 4890-4903
Markdown (Informal)
[Half-S: Halving the Scale for Near-Lossless 4-Bit LLM Training](https://aclanthology.org/2026.findings-acl.241/) (Du et al., Findings 2026)
ACL
- Jinyang Du, Ruihao Gong, Linghan Ai, Zining Wang, Yunke Peng, Yao Wang, Lei Yan, Wxuefei, Yaoyuan Wang, Jinyang Guo, Dahua Lin, and Xianglong Liu. 2026. Half-S: Halving the Scale for Near-Lossless 4-Bit LLM Training. In Findings of the Association for Computational Linguistics: ACL 2026, pages 4890–4903, San Diego, California, United States. Association for Computational Linguistics.