@inproceedings{lv-etal-2023-lightformer,
title = "{L}ight{F}ormer: Light-weight Transformer Using {SVD}-based Weight Transfer and Parameter Sharing",
author = "Lv, Xiuqing and
Zhang, Peng and
Li, Sunzhu and
Gan, Guobing and
Sun, Yueheng",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.656/",
doi = "10.18653/v1/2023.findings-acl.656",
pages = "10323--10335",
abstract = "Transformer has become an important technique for natural language processing tasks with great success. However, it usually requires huge storage space and computational cost, making it difficult to be deployed on resource-constrained edge devices. To compress and accelerate Transformer, we propose LightFormer, which adopts a low-rank factorization initialized by SVD-based weight transfer and parameter sharing. The SVD-based weight transfer can effectively utilize the well-trained Transformer parameter knowledge to speed up the model convergence, and effectively alleviate the low-rank bottleneck problem combined with parameter sharing. We validate our method on machine translation, text summarization and text classification tasks. Experiments show that on IWSLT'14 De-En and WMT'14 En-De, LightFormer achieves similar performance to the baseline Transformer with 3.8 times and 1.8 times fewer parameters, and achieves 2.3 times speedup and 1.5 times speedup respectively, generally outperforming recent light-weight Transformers."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lv-etal-2023-lightformer">
<titleInfo>
<title>LightFormer: Light-weight Transformer Using SVD-based Weight Transfer and Parameter Sharing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiuqing</namePart>
<namePart type="family">Lv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sunzhu</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guobing</namePart>
<namePart type="family">Gan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yueheng</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Transformer has become an important technique for natural language processing tasks with great success. However, it usually requires huge storage space and computational cost, making it difficult to be deployed on resource-constrained edge devices. To compress and accelerate Transformer, we propose LightFormer, which adopts a low-rank factorization initialized by SVD-based weight transfer and parameter sharing. The SVD-based weight transfer can effectively utilize the well-trained Transformer parameter knowledge to speed up the model convergence, and effectively alleviate the low-rank bottleneck problem combined with parameter sharing. We validate our method on machine translation, text summarization and text classification tasks. Experiments show that on IWSLT’14 De-En and WMT’14 En-De, LightFormer achieves similar performance to the baseline Transformer with 3.8 times and 1.8 times fewer parameters, and achieves 2.3 times speedup and 1.5 times speedup respectively, generally outperforming recent light-weight Transformers.</abstract>
<identifier type="citekey">lv-etal-2023-lightformer</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.656</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.656/</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>10323</start>
<end>10335</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LightFormer: Light-weight Transformer Using SVD-based Weight Transfer and Parameter Sharing
%A Lv, Xiuqing
%A Zhang, Peng
%A Li, Sunzhu
%A Gan, Guobing
%A Sun, Yueheng
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F lv-etal-2023-lightformer
%X Transformer has become an important technique for natural language processing tasks with great success. However, it usually requires huge storage space and computational cost, making it difficult to be deployed on resource-constrained edge devices. To compress and accelerate Transformer, we propose LightFormer, which adopts a low-rank factorization initialized by SVD-based weight transfer and parameter sharing. The SVD-based weight transfer can effectively utilize the well-trained Transformer parameter knowledge to speed up the model convergence, and effectively alleviate the low-rank bottleneck problem combined with parameter sharing. We validate our method on machine translation, text summarization and text classification tasks. Experiments show that on IWSLT’14 De-En and WMT’14 En-De, LightFormer achieves similar performance to the baseline Transformer with 3.8 times and 1.8 times fewer parameters, and achieves 2.3 times speedup and 1.5 times speedup respectively, generally outperforming recent light-weight Transformers.
%R 10.18653/v1/2023.findings-acl.656
%U https://aclanthology.org/2023.findings-acl.656/
%U https://doi.org/10.18653/v1/2023.findings-acl.656
%P 10323-10335
Markdown (Informal)
[LightFormer: Light-weight Transformer Using SVD-based Weight Transfer and Parameter Sharing](https://aclanthology.org/2023.findings-acl.656/) (Lv et al., Findings 2023)
ACL