@inproceedings{zhang-etal-2024-afpq,
title = "{AFPQ}: Asymmetric Floating Point Quantization for {LLM}s",
author = "Zhang, Yijia and
Zhang, Sicheng and
Cao, Shijie and
Du, DaYou and
Wei, Jianyu and
Cao, Ting and
Xu, Ningyi",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.3",
doi = "10.18653/v1/2024.findings-acl.3",
pages = "28--36",
abstract = "Large language models (LLMs) show great performance in various tasks, but face deployment challenges from limited memory capacity and bandwidth. Low-bit weight quantization can save memory and accelerate inference. Although floating-point (FP) formats show good performance in LLM quantization, they tend to perform poorly with small group sizes or sub-4 bits. We find the reason is that the absence of asymmetry in previous FP quantization makes it unsuitable for handling asymmetric value distribution of LLM weight tensors. In this work, we propose asymmetric FP quantization (AFPQ), which sets separate scales for positive and negative values. Our method leads to large accuracy improvements and can be easily plugged into other quantization methods, including GPTQ and AWQ, for better performance. Besides, no additional storage is needed compared with asymmetric integer (INT) quantization. The code is available at https://github.com/zhangsichengsjtu/AFPQ.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2024-afpq">
<titleInfo>
<title>AFPQ: Asymmetric Floating Point Quantization for LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yijia</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sicheng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shijie</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">DaYou</namePart>
<namePart type="family">Du</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianyu</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ting</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ningyi</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models (LLMs) show great performance in various tasks, but face deployment challenges from limited memory capacity and bandwidth. Low-bit weight quantization can save memory and accelerate inference. Although floating-point (FP) formats show good performance in LLM quantization, they tend to perform poorly with small group sizes or sub-4 bits. We find the reason is that the absence of asymmetry in previous FP quantization makes it unsuitable for handling asymmetric value distribution of LLM weight tensors. In this work, we propose asymmetric FP quantization (AFPQ), which sets separate scales for positive and negative values. Our method leads to large accuracy improvements and can be easily plugged into other quantization methods, including GPTQ and AWQ, for better performance. Besides, no additional storage is needed compared with asymmetric integer (INT) quantization. The code is available at https://github.com/zhangsichengsjtu/AFPQ.</abstract>
<identifier type="citekey">zhang-etal-2024-afpq</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.3</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.3</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>28</start>
<end>36</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AFPQ: Asymmetric Floating Point Quantization for LLMs
%A Zhang, Yijia
%A Zhang, Sicheng
%A Cao, Shijie
%A Du, DaYou
%A Wei, Jianyu
%A Cao, Ting
%A Xu, Ningyi
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F zhang-etal-2024-afpq
%X Large language models (LLMs) show great performance in various tasks, but face deployment challenges from limited memory capacity and bandwidth. Low-bit weight quantization can save memory and accelerate inference. Although floating-point (FP) formats show good performance in LLM quantization, they tend to perform poorly with small group sizes or sub-4 bits. We find the reason is that the absence of asymmetry in previous FP quantization makes it unsuitable for handling asymmetric value distribution of LLM weight tensors. In this work, we propose asymmetric FP quantization (AFPQ), which sets separate scales for positive and negative values. Our method leads to large accuracy improvements and can be easily plugged into other quantization methods, including GPTQ and AWQ, for better performance. Besides, no additional storage is needed compared with asymmetric integer (INT) quantization. The code is available at https://github.com/zhangsichengsjtu/AFPQ.
%R 10.18653/v1/2024.findings-acl.3
%U https://aclanthology.org/2024.findings-acl.3
%U https://doi.org/10.18653/v1/2024.findings-acl.3
%P 28-36
Markdown (Informal)
[AFPQ: Asymmetric Floating Point Quantization for LLMs](https://aclanthology.org/2024.findings-acl.3) (Zhang et al., Findings 2024)
ACL
- Yijia Zhang, Sicheng Zhang, Shijie Cao, DaYou Du, Jianyu Wei, Ting Cao, and Ningyi Xu. 2024. AFPQ: Asymmetric Floating Point Quantization for LLMs. In Findings of the Association for Computational Linguistics: ACL 2024, pages 28–36, Bangkok, Thailand. Association for Computational Linguistics.