@inproceedings{yi-etal-2025-fpe2m2,
title = "{FPE}2{M}2: Approaching Lossless and Efficient Quantization with Native Floating Point",
author = "Yi, Ke and
Zhang, Jianwei and
Xu, Zhiying and
Yang, Xinlong and
Zhou, Yang and
Sun, Minmin and
Liu, Zengke and
Zhang, Tong and
Lin, Junyang and
Zhou, Jingren",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.943/",
doi = "10.18653/v1/2025.findings-acl.943",
pages = "18350--18361",
ISBN = "979-8-89176-256-5",
abstract = "Auto-regressive decoding is a memory-bound job, meaning decoding inference performance is limited by the bandwidth rather than the computational capabilities of the GPU. Weight-only quantization is a promising method to address the memory-bound limitations. Previous studies have followed one of two approaches. Some have exclusively studied integer quantization while ignoring the Gaussian distribution nature of LLMs' weights. Others have proposed non-uniform quantization but incurred additional I/O overhead due to lookup tables, e.g. NF4. In this work, we extend the IEEE 754 float-point standard to the ExMy quantization schema, which allocates x bit for the exponent and y bit for the mantissa to represent a number. In terms of runtime efficiency, we demonstrate that the conversion from ExMy to FP16 can be realized through register-level operations, which can get almost the same performance as INT5. In terms of quantization loss, we analyze that of different ExMy settings, where the E2M2 schema achieves an optimal balance, offering the highest efficiency with lossless accuracy. We further propose the FPE2M2 framework that supports lossless weight-only quantization inference and validate the FPE2M2 framework on Qwen and LLaMA Models across various modalities, such as text, image, and audio tasks, which achieves a faster inference speed while maintaining nearly lossless accuracy."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yi-etal-2025-fpe2m2">
<titleInfo>
<title>FPE2M2: Approaching Lossless and Efficient Quantization with Native Floating Point</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ke</namePart>
<namePart type="family">Yi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianwei</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiying</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinlong</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minmin</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zengke</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junyang</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingren</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Auto-regressive decoding is a memory-bound job, meaning decoding inference performance is limited by the bandwidth rather than the computational capabilities of the GPU. Weight-only quantization is a promising method to address the memory-bound limitations. Previous studies have followed one of two approaches. Some have exclusively studied integer quantization while ignoring the Gaussian distribution nature of LLMs’ weights. Others have proposed non-uniform quantization but incurred additional I/O overhead due to lookup tables, e.g. NF4. In this work, we extend the IEEE 754 float-point standard to the ExMy quantization schema, which allocates x bit for the exponent and y bit for the mantissa to represent a number. In terms of runtime efficiency, we demonstrate that the conversion from ExMy to FP16 can be realized through register-level operations, which can get almost the same performance as INT5. In terms of quantization loss, we analyze that of different ExMy settings, where the E2M2 schema achieves an optimal balance, offering the highest efficiency with lossless accuracy. We further propose the FPE2M2 framework that supports lossless weight-only quantization inference and validate the FPE2M2 framework on Qwen and LLaMA Models across various modalities, such as text, image, and audio tasks, which achieves a faster inference speed while maintaining nearly lossless accuracy.</abstract>
<identifier type="citekey">yi-etal-2025-fpe2m2</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.943</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.943/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>18350</start>
<end>18361</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T FPE2M2: Approaching Lossless and Efficient Quantization with Native Floating Point
%A Yi, Ke
%A Zhang, Jianwei
%A Xu, Zhiying
%A Yang, Xinlong
%A Zhou, Yang
%A Sun, Minmin
%A Liu, Zengke
%A Zhang, Tong
%A Lin, Junyang
%A Zhou, Jingren
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F yi-etal-2025-fpe2m2
%X Auto-regressive decoding is a memory-bound job, meaning decoding inference performance is limited by the bandwidth rather than the computational capabilities of the GPU. Weight-only quantization is a promising method to address the memory-bound limitations. Previous studies have followed one of two approaches. Some have exclusively studied integer quantization while ignoring the Gaussian distribution nature of LLMs’ weights. Others have proposed non-uniform quantization but incurred additional I/O overhead due to lookup tables, e.g. NF4. In this work, we extend the IEEE 754 float-point standard to the ExMy quantization schema, which allocates x bit for the exponent and y bit for the mantissa to represent a number. In terms of runtime efficiency, we demonstrate that the conversion from ExMy to FP16 can be realized through register-level operations, which can get almost the same performance as INT5. In terms of quantization loss, we analyze that of different ExMy settings, where the E2M2 schema achieves an optimal balance, offering the highest efficiency with lossless accuracy. We further propose the FPE2M2 framework that supports lossless weight-only quantization inference and validate the FPE2M2 framework on Qwen and LLaMA Models across various modalities, such as text, image, and audio tasks, which achieves a faster inference speed while maintaining nearly lossless accuracy.
%R 10.18653/v1/2025.findings-acl.943
%U https://aclanthology.org/2025.findings-acl.943/
%U https://doi.org/10.18653/v1/2025.findings-acl.943
%P 18350-18361
Markdown (Informal)
[FPE2M2: Approaching Lossless and Efficient Quantization with Native Floating Point](https://aclanthology.org/2025.findings-acl.943/) (Yi et al., Findings 2025)
ACL
- Ke Yi, Jianwei Zhang, Zhiying Xu, Xinlong Yang, Yang Zhou, Minmin Sun, Zengke Liu, Tong Zhang, Junyang Lin, and Jingren Zhou. 2025. FPE2M2: Approaching Lossless and Efficient Quantization with Native Floating Point. In Findings of the Association for Computational Linguistics: ACL 2025, pages 18350–18361, Vienna, Austria. Association for Computational Linguistics.