@inproceedings{ma-etal-2024-sparsity,
title = "Sparsity-Accelerated Training for Large Language Models",
author = "Ma, Da and
Chen, Lu and
Wang, Pengyu and
Xu, Hongshen and
Li, Hanqi and
Sun, Liangtai and
Zhu, Su and
Fan, Shuai and
Yu, Kai",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand and virtual meeting",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.875",
doi = "10.18653/v1/2024.findings-acl.875",
pages = "14696--14707",
abstract = "Large language models (LLMs) have demonstrated proficiency across various natural language processing (NLP) tasks but often require additional training, such as continual pre-training and supervised fine-tuning. However, the costs associated with this, primarily due to their large parameter count, remain high. This paper proposes leveraging \textit{sparsity} in pre-trained LLMs to expedite this training process. By observing sparsity in activated neurons during forward iterations, we identify the potential for computational speed-ups by excluding inactive neurons. We address associated challenges by extending existing neuron importance evaluation metrics and introducing a ladder omission rate scheduler. Our experiments on Llama-2 demonstrate that Sparsity-Accelerated Training (SAT) achieves comparable or superior performance to standard training while significantly accelerating the process. Specifically, SAT achieves a 45{\%} throughput improvement in continual pre-training and saves 38{\%} training time in supervised fine-tuning. It offers a simple, hardware-agnostic, and easily deployable framework for additional LLM training.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ma-etal-2024-sparsity">
<titleInfo>
<title>Sparsity-Accelerated Training for Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Da</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pengyu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongshen</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanqi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liangtai</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Su</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuai</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand and virtual meeting</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models (LLMs) have demonstrated proficiency across various natural language processing (NLP) tasks but often require additional training, such as continual pre-training and supervised fine-tuning. However, the costs associated with this, primarily due to their large parameter count, remain high. This paper proposes leveraging sparsity in pre-trained LLMs to expedite this training process. By observing sparsity in activated neurons during forward iterations, we identify the potential for computational speed-ups by excluding inactive neurons. We address associated challenges by extending existing neuron importance evaluation metrics and introducing a ladder omission rate scheduler. Our experiments on Llama-2 demonstrate that Sparsity-Accelerated Training (SAT) achieves comparable or superior performance to standard training while significantly accelerating the process. Specifically, SAT achieves a 45% throughput improvement in continual pre-training and saves 38% training time in supervised fine-tuning. It offers a simple, hardware-agnostic, and easily deployable framework for additional LLM training.</abstract>
<identifier type="citekey">ma-etal-2024-sparsity</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.875</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.875</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>14696</start>
<end>14707</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Sparsity-Accelerated Training for Large Language Models
%A Ma, Da
%A Chen, Lu
%A Wang, Pengyu
%A Xu, Hongshen
%A Li, Hanqi
%A Sun, Liangtai
%A Zhu, Su
%A Fan, Shuai
%A Yu, Kai
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand and virtual meeting
%F ma-etal-2024-sparsity
%X Large language models (LLMs) have demonstrated proficiency across various natural language processing (NLP) tasks but often require additional training, such as continual pre-training and supervised fine-tuning. However, the costs associated with this, primarily due to their large parameter count, remain high. This paper proposes leveraging sparsity in pre-trained LLMs to expedite this training process. By observing sparsity in activated neurons during forward iterations, we identify the potential for computational speed-ups by excluding inactive neurons. We address associated challenges by extending existing neuron importance evaluation metrics and introducing a ladder omission rate scheduler. Our experiments on Llama-2 demonstrate that Sparsity-Accelerated Training (SAT) achieves comparable or superior performance to standard training while significantly accelerating the process. Specifically, SAT achieves a 45% throughput improvement in continual pre-training and saves 38% training time in supervised fine-tuning. It offers a simple, hardware-agnostic, and easily deployable framework for additional LLM training.
%R 10.18653/v1/2024.findings-acl.875
%U https://aclanthology.org/2024.findings-acl.875
%U https://doi.org/10.18653/v1/2024.findings-acl.875
%P 14696-14707
Markdown (Informal)
[Sparsity-Accelerated Training for Large Language Models](https://aclanthology.org/2024.findings-acl.875) (Ma et al., Findings 2024)
ACL
- Da Ma, Lu Chen, Pengyu Wang, Hongshen Xu, Hanqi Li, Liangtai Sun, Su Zhu, Shuai Fan, and Kai Yu. 2024. Sparsity-Accelerated Training for Large Language Models. In Findings of the Association for Computational Linguistics ACL 2024, pages 14696–14707, Bangkok, Thailand and virtual meeting. Association for Computational Linguistics.