@inproceedings{varshney-etal-2024-investigating,
title = "Investigating Acceleration of {LL}a{MA} Inference by Enabling Intermediate Layer Decoding via Instruction Tuning with {`}{LITE}{'}",
author = "Varshney, Neeraj and
Chatterjee, Agneet and
Parmar, Mihir and
Baral, Chitta",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.232",
doi = "10.18653/v1/2024.findings-naacl.232",
pages = "3656--3677",
abstract = "Large Language Models (LLMs) have achieved remarkable performance across a wide variety of tasks; however, their large size makes their inference slow and computationally expensive. Focusing on this problem, we study instruction tuning LLMs with additional explicit Losses from the Intermediate layers (LITE) and show that it enables these layers to acquire {`}good{'} generation ability without affecting the generation ability of the final layer. We then perform {`}dynamic confidence-based early exiting{'} at token level from the intermediate layers which improves the computational efficiency of text generation without sacrificing the quality of the generation. We conduct comprehensive experiments by instruction tuning LLaMA-2 models on the Alpaca dataset and evaluate on four different instruction test sets. We show that dynamic early exiting achieves consistent and considerable inference cost improvements (37.86{\%} for 7B and 46.35{\%} for 13B model) while maintaining the generation quality. We further conduct a thorough analysis of the results and dissect the efficiency improvements which reveals several important findings.",
}
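
For readers who want a concrete picture of the two mechanisms the abstract names, below is a minimal PyTorch sketch of (a) an auxiliary intermediate-layer loss in the spirit of LITE and (b) token-level confidence-based early exiting. Everything here is an illustrative assumption rather than the paper's implementation: the names hidden_states, lm_head, and layer_weights and the 0.9 confidence threshold are placeholders, the paper's layer selection, weighting, and confidence measure may differ, and a real early-exit decoder would run layers incrementally instead of materializing all hidden states first.

    import torch.nn.functional as F

    def lite_style_loss(hidden_states, lm_head, labels, layer_weights):
        # Sketch of an intermediate-layer objective: each chosen layer's
        # hidden states are projected through the shared LM head and scored
        # with the usual next-token cross-entropy, then the per-layer losses
        # are summed with illustrative weights. (The paper's exact layer
        # selection and weighting may differ.)
        total = 0.0
        for h, w in zip(hidden_states, layer_weights):   # h: (batch, seq, dim)
            logits = lm_head(h)                          # (batch, seq, vocab)
            total = total + w * F.cross_entropy(
                logits[:, :-1].reshape(-1, logits.size(-1)),  # predict token t+1
                labels[:, 1:].reshape(-1),
                ignore_index=-100,                       # mask prompt/padding
            )
        return total

    def early_exit_next_token(hidden_states, lm_head, threshold=0.9):
        # Token-level dynamic early exiting (batch size 1 for clarity):
        # walk the layers shallow-to-deep and emit the first prediction
        # whose softmax confidence clears the threshold; otherwise fall
        # back to the final layer's prediction.
        token = None
        for h in hidden_states:                          # shallow -> deep
            probs = F.softmax(lm_head(h[:, -1]), dim=-1)
            conf, token = probs.max(dim=-1)
            if conf.item() >= threshold:
                break                                    # skip the deeper layers
        return token

In practice the saving comes from not executing the skipped layers at all; a real decoder also has to reconcile the KV cache of layers that were skipped for earlier tokens, a detail this sketch ignores.
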