BibTeX
@inproceedings{chitsaz-etal-2024-exploring,
    title = "Exploring Quantization for Efficient Pre-Training of Transformer Language Models",
    author = "Chitsaz, Kamran and
      Fournier, Quentin and
      Mordido, Goncalo and
      Chandar, Sarath",
    editor = "Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.findings-emnlp.787/",
    doi = "10.18653/v1/2024.findings-emnlp.787",
    pages = "13473--13487",
    abstract = "The increasing scale of Transformer models has led to an increase in their pre-training computational requirements. While quantization has proven to be effective after pre-training and during fine-tuning, applying quantization in Transformers during pre-training has remained largely unexplored at scale for language modeling. This study aims to explore the impact of quantization for efficient pre-training of Transformers, with a focus on linear layer components. By systematically applying straightforward linear quantization to weights, activations, gradients, and optimizer states, we assess its effects on model efficiency, stability, and performance during training. By offering a comprehensive recipe of effective quantization strategies to be applied during the pre-training of Transformers, we promote high training efficiency from scratch while retaining language modeling ability."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="chitsaz-etal-2024-exploring">
    <titleInfo>
      <title>Exploring Quantization for Efficient Pre-Training of Transformer Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Kamran</namePart>
      <namePart type="family">Chitsaz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Quentin</namePart>
      <namePart type="family">Fournier</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Goncalo</namePart>
      <namePart type="family">Mordido</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sarath</namePart>
      <namePart type="family">Chandar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The increasing scale of Transformer models has led to an increase in their pre-training computational requirements. While quantization has proven to be effective after pre-training and during fine-tuning, applying quantization in Transformers during pre-training has remained largely unexplored at scale for language modeling. This study aims to explore the impact of quantization for efficient pre-training of Transformers, with a focus on linear layer components. By systematically applying straightforward linear quantization to weights, activations, gradients, and optimizer states, we assess its effects on model efficiency, stability, and performance during training. By offering a comprehensive recipe of effective quantization strategies to be applied during the pre-training of Transformers, we promote high training efficiency from scratch while retaining language modeling ability.</abstract>
    <identifier type="citekey">chitsaz-etal-2024-exploring</identifier>
    <identifier type="doi">10.18653/v1/2024.findings-emnlp.787</identifier>
    <location>
      <url>https://aclanthology.org/2024.findings-emnlp.787/</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>13473</start>
        <end>13487</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Exploring Quantization for Efficient Pre-Training of Transformer Language Models
%A Chitsaz, Kamran
%A Fournier, Quentin
%A Mordido, Goncalo
%A Chandar, Sarath
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F chitsaz-etal-2024-exploring
%X The increasing scale of Transformer models has led to an increase in their pre-training computational requirements. While quantization has proven to be effective after pre-training and during fine-tuning, applying quantization in Transformers during pre-training has remained largely unexplored at scale for language modeling. This study aims to explore the impact of quantization for efficient pre-training of Transformers, with a focus on linear layer components. By systematically applying straightforward linear quantization to weights, activations, gradients, and optimizer states, we assess its effects on model efficiency, stability, and performance during training. By offering a comprehensive recipe of effective quantization strategies to be applied during the pre-training of Transformers, we promote high training efficiency from scratch while retaining language modeling ability.
%R 10.18653/v1/2024.findings-emnlp.787
%U https://aclanthology.org/2024.findings-emnlp.787/
%U https://doi.org/10.18653/v1/2024.findings-emnlp.787
%P 13473-13487
Markdown (Informal)
[Exploring Quantization for Efficient Pre-Training of Transformer Language Models](https://aclanthology.org/2024.findings-emnlp.787/) (Chitsaz et al., Findings 2024)
ACL
Kamran Chitsaz, Quentin Fournier, Goncalo Mordido, and Sarath Chandar. 2024. [Exploring Quantization for Efficient Pre-Training of Transformer Language Models](https://aclanthology.org/2024.findings-emnlp.787/). In *Findings of the Association for Computational Linguistics: EMNLP 2024*, pages 13473–13487, Miami, Florida, USA. Association for Computational Linguistics.
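
The "straightforward linear quantization" the abstract refers to is, in its simplest symmetric form, a uniform mapping of a tensor onto a low-bit integer grid followed by rescaling back to floating point. A minimal sketch of that idea is below; the function name, the 8-bit default, and the absmax scaling choice are illustrative assumptions, not the paper's exact recipe.

```python
import torch

def linear_quantize(x: torch.Tensor, num_bits: int = 8) -> torch.Tensor:
    """Symmetric linear (uniform) quantization, for illustration only.

    Maps values in [-max|x|, max|x|] onto signed integers representable
    with `num_bits` bits, then rescales back to float ("fake quantization").
    """
    qmax = 2 ** (num_bits - 1) - 1                # e.g. 127 for 8 bits
    scale = x.abs().max().clamp(min=1e-8) / qmax  # guard against all-zero input
    q = torch.clamp(torch.round(x / scale), -qmax, qmax)
    return q * scale                              # dequantized tensor

# Example: quantize a linear layer's weight tensor and inspect the error.
w = torch.randn(512, 512)
w_q = linear_quantize(w, num_bits=8)
print((w - w_q).abs().max())
```

In quantization-aware pre-training setups of this kind, such fake quantization is typically applied on the fly in the forward pass (to weights, activations, or gradients), while a full-precision master copy of the weights receives the optimizer updates.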