@inproceedings{tayaranian-hosseini-etal-2023-towards,
title = "Towards Fine-tuning Pre-trained Language Models with Integer Forward and Backward Propagation",
author = "Tayaranian Hosseini, Mohammadreza and
Ghaffari, Alireza and
Tahaei, Marzieh S. and
Rezagholizadeh, Mehdi and
Asgharian, Masoud and
Partovi Nia, Vahid",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2023",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-eacl.143",
doi = "10.18653/v1/2023.findings-eacl.143",
pages = "1912--1921",
abstract = "The large number of parameters of some prominent language models, such as BERT, makes their fine-tuning on downstream tasks computationally intensive and energy hungry. Previously researchers were focused on lower bit-width integer data types for the forward propagation of language models to save memory and computation. As for the backward propagation, however, only 16-bit floating-point data type has been used for the fine-tuning of BERT.In this work, we use integer arithmetic for both forward and back propagation in the fine-tuning of BERT.We study the effects of varying the integer bit-width on the model{'}s metric performance. Our integer fine-tuning uses integer arithmetic to perform forward propagation and gradient computation of linear, layer-norm, and embedding layers of BERT.We fine-tune BERT using our integer training method on SQuAD v1.1 and SQuAD v2., and GLUE benchmark. We demonstrate that metric performance of fine-tuning 16-bit integer BERT matches both 16-bit and 32-bit floating-point baselines. Furthermore, using the faster and more memory efficient 8-bit integer data type, integer fine-tuning of BERT loses an average of 3.1 points compared to the FP32 baseline.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="tayaranian-hosseini-etal-2023-towards">
    <titleInfo>
      <title>Towards Fine-tuning Pre-trained Language Models with Integer Forward and Backward Propagation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Mohammadreza</namePart>
      <namePart type="family">Tayaranian Hosseini</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alireza</namePart>
      <namePart type="family">Ghaffari</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marzieh</namePart>
      <namePart type="given">S</namePart>
      <namePart type="family">Tahaei</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mehdi</namePart>
      <namePart type="family">Rezagholizadeh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Masoud</namePart>
      <namePart type="family">Asgharian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vahid</namePart>
      <namePart type="family">Partovi Nia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EACL 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Vlachos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Isabelle</namePart>
        <namePart type="family">Augenstein</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The large number of parameters of some prominent language models, such as BERT, makes their fine-tuning on downstream tasks computationally intensive and energy-hungry. Previously, researchers focused on lower bit-width integer data types for the forward propagation of language models to save memory and computation. As for the backward propagation, however, only the 16-bit floating-point data type has been used for the fine-tuning of BERT. In this work, we use integer arithmetic for both forward and backward propagation in the fine-tuning of BERT. We study the effects of varying the integer bit-width on the model’s metric performance. Our integer fine-tuning uses integer arithmetic to perform the forward propagation and gradient computation of the linear, layer-norm, and embedding layers of BERT. We fine-tune BERT using our integer training method on SQuAD v1.1, SQuAD v2.0, and the GLUE benchmark. We demonstrate that the metric performance of fine-tuning 16-bit integer BERT matches both the 16-bit and 32-bit floating-point baselines. Furthermore, using the faster and more memory-efficient 8-bit integer data type, integer fine-tuning of BERT loses an average of 3.1 points compared to the FP32 baseline.</abstract>
<identifier type="citekey">tayaranian-hosseini-etal-2023-towards</identifier>
<identifier type="doi">10.18653/v1/2023.findings-eacl.143</identifier>
<location>
<url>https://aclanthology.org/2023.findings-eacl.143</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>1912</start>
<end>1921</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Fine-tuning Pre-trained Language Models with Integer Forward and Backward Propagation
%A Tayaranian Hosseini, Mohammadreza
%A Ghaffari, Alireza
%A Tahaei, Marzieh S.
%A Rezagholizadeh, Mehdi
%A Asgharian, Masoud
%A Partovi Nia, Vahid
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Findings of the Association for Computational Linguistics: EACL 2023
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F tayaranian-hosseini-etal-2023-towards
%X The large number of parameters of some prominent language models, such as BERT, makes their fine-tuning on downstream tasks computationally intensive and energy-hungry. Previously, researchers focused on lower bit-width integer data types for the forward propagation of language models to save memory and computation. As for the backward propagation, however, only the 16-bit floating-point data type has been used for the fine-tuning of BERT. In this work, we use integer arithmetic for both forward and backward propagation in the fine-tuning of BERT. We study the effects of varying the integer bit-width on the model’s metric performance. Our integer fine-tuning uses integer arithmetic to perform the forward propagation and gradient computation of the linear, layer-norm, and embedding layers of BERT. We fine-tune BERT using our integer training method on SQuAD v1.1, SQuAD v2.0, and the GLUE benchmark. We demonstrate that the metric performance of fine-tuning 16-bit integer BERT matches both the 16-bit and 32-bit floating-point baselines. Furthermore, using the faster and more memory-efficient 8-bit integer data type, integer fine-tuning of BERT loses an average of 3.1 points compared to the FP32 baseline.
%R 10.18653/v1/2023.findings-eacl.143
%U https://aclanthology.org/2023.findings-eacl.143
%U https://doi.org/10.18653/v1/2023.findings-eacl.143
%P 1912-1921
[Towards Fine-tuning Pre-trained Language Models with Integer Forward and Backward Propagation](https://aclanthology.org/2023.findings-eacl.143) (Tayaranian Hosseini et al., Findings 2023)