% BibTeX record (citation key: kapadia-etal-2026-leap)
@inproceedings{kapadia-etal-2026-leap,
    title = "{LEAP}: Layer-wise Exit-Aware Pretraining for Efficient Transformer Inference",
    author = "Kapadia, Shashank and
      Mishra, Deep Narayan and
      Alugubelli, Sujal Reddy and
      Wang, Haoan and
      Vabbilisetty, Saipraveen and
      Bhatia, Rishi and
      Sharma, Anupriya",
    editor = "Li, Yunyao and
      Rehm, Georg and
      Tu, Mei",
    booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics ({ACL} 2026)",
    month = jul,
    year = "2026",
    address = "San Diego, California, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.acl-industry.52/",
    pages = "761--774",
    isbn = "979-8-89176-394-4",
    abstract = "Layer-aligned distillation and convergence-based early exit represent two predominant computational efficiency paradigms for transformer inference; yet we establish that they exhibit fundamental incompatibility under standard deployment conditions for convergence-based early exit. Distillation objectives that align intermediate student layers to teacher representations suppress the representational convergence that early-exit mechanisms exploit, rendering such mechanisms ineffective on distilled models. We introduce LEAP (Layer-wise Exit-Aware Pretraining), an auxiliary training objective that reconciles this incompatibility. LEAP requires no architectural modifications; it augments standard distillation with a single constraint ensuring intermediate layers approximate final-layer representations. LEAP-MiniLM achieves $1.61\times$ measured wall-clock speedup (batch $= 1$, NVIDIA L4) at $\theta = 0.95$, with 91.9{\%} of samples exiting by layer 7 and $1.80\times$ theoretical layer reduction, where standard distilled models achieve zero effective speedup. We validate across sentence similarity (STS-B: $0.760 \pm 0.006$) and retrieval benchmarks (BEIR), providing operational guidance including latency measurements, decision thresholds, and deployment criteria."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kapadia-etal-2026-leap">
<titleInfo>
<title>LEAP: Layer-wise Exit-Aware Pretraining for Efficient Transformer Inference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shashank</namePart>
<namePart type="family">Kapadia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Deep</namePart>
<namePart type="given">Narayan</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujal</namePart>
<namePart type="given">Reddy</namePart>
<namePart type="family">Alugubelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haoan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saipraveen</namePart>
<namePart type="family">Vabbilisetty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rishi</namePart>
<namePart type="family">Bhatia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anupriya</namePart>
<namePart type="family">Sharma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Georg</namePart>
<namePart type="family">Rehm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mei</namePart>
<namePart type="family">Tu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-394-4</identifier>
</relatedItem>
<abstract>Layer-aligned distillation and convergence-based early exit represent two predominant computational efficiency paradigms for transformer inference; yet we establish that they exhibit fundamental incompatibility under standard deployment conditions for convergence-based early exit. Distillation objectives that align intermediate student layers to teacher representations suppress the representational convergence that early-exit mechanisms exploit, rendering such mechanisms ineffective on distilled models. We introduce LEAP (Layer-wise Exit-Aware Pretraining), an auxiliary training objective that reconciles this incompatibility. LEAP requires no architectural modifications; it augments standard distillation with a single constraint ensuring intermediate layers approximate final-layer representations. LEAP-MiniLM achieves 1.61× measured wall-clock speedup (batch = 1, NVIDIA L4) at θ = 0.95, with 91.9% of samples exiting by layer 7 and 1.80× theoretical layer reduction, where standard distilled models achieve zero effective speedup. We validate across sentence similarity (STS-B: 0.760 ± 0.006) and retrieval benchmarks (BEIR), providing operational guidance including latency measurements, decision thresholds, and deployment criteria.</abstract>
<identifier type="citekey">kapadia-etal-2026-leap</identifier>
<location>
<url>https://aclanthology.org/2026.acl-industry.52/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>761</start>
<end>774</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LEAP: Layer-wise Exit-Aware Pretraining for Efficient Transformer Inference
%A Kapadia, Shashank
%A Mishra, Deep Narayan
%A Alugubelli, Sujal Reddy
%A Wang, Haoan
%A Vabbilisetty, Saipraveen
%A Bhatia, Rishi
%A Sharma, Anupriya
%Y Li, Yunyao
%Y Rehm, Georg
%Y Tu, Mei
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-394-4
%F kapadia-etal-2026-leap
%X Layer-aligned distillation and convergence-based early exit represent two predominant computational efficiency paradigms for transformer inference; yet we establish that they exhibit fundamental incompatibility under standard deployment conditions for convergence-based early exit. Distillation objectives that align intermediate student layers to teacher representations suppress the representational convergence that early-exit mechanisms exploit, rendering such mechanisms ineffective on distilled models. We introduce LEAP (Layer-wise Exit-Aware Pretraining), an auxiliary training objective that reconciles this incompatibility. LEAP requires no architectural modifications; it augments standard distillation with a single constraint ensuring intermediate layers approximate final-layer representations. LEAP-MiniLM achieves 1.61× measured wall-clock speedup (batch = 1, NVIDIA L4) at θ = 0.95, with 91.9% of samples exiting by layer 7 and 1.80× theoretical layer reduction, where standard distilled models achieve zero effective speedup. We validate across sentence similarity (STS-B: 0.760 ± 0.006) and retrieval benchmarks (BEIR), providing operational guidance including latency measurements, decision thresholds, and deployment criteria.
%U https://aclanthology.org/2026.acl-industry.52/
%P 761-774
Markdown (Informal)
[LEAP: Layer-wise Exit-Aware Pretraining for Efficient Transformer Inference](https://aclanthology.org/2026.acl-industry.52/) (Kapadia et al., ACL 2026)
ACL
- Shashank Kapadia, Deep Narayan Mishra, Sujal Reddy Alugubelli, Haoan Wang, Saipraveen Vabbilisetty, Rishi Bhatia, and Anupriya Sharma. 2026. LEAP: Layer-wise Exit-Aware Pretraining for Efficient Transformer Inference. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026), pages 761–774, San Diego, California, USA. Association for Computational Linguistics.