@inproceedings{kim-etal-2025-fractalllm,
title = "{F}ractal{LLM}: Lossless Self-Speculative Decoding with Layer Embedded Self-Compression",
author = "Kim, Juhyeong and
Yu, Sangyeon and
Kim, Gyunyeop and
Kang, Sangwoo",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1286/",
pages = "23666--23673",
ISBN = "979-8-89176-335-7",
abstract = "Autoregressive decoding in large language models (LLMs) necessitates a full forward pass for each generated token, significantly increasing inference latency. To address this limitation, we propose Fractal-LLM, a lossless self-speculative decoding method that embeds a compressed model within selected decoder layers of the original model. Specifically, our approach generates multiple draft tokens in parallel by injecting compressed layers into selected decoder layers. These draft tokens are subsequently verified through a single forward pass of the original model, ensuring the final outputs exactly match those produced by the original model. Experimental results across diverse benchmarks{---}including GSM8K, XSUM, CNN/DailyMail, and HumanEval{---}demonstrate that our method achieves substantial inference speed-ups (up to 2.47{\texttimes}) compared to standard autoregressive decoding, without requiring any additional training."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kim-etal-2025-fractalllm">
<titleInfo>
<title>FractalLLM: Lossless Self-Speculative Decoding with Layer Embedded Self-Compression</title>
</titleInfo>
<name type="personal">
<namePart type="given">Juhyeong</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sangyeon</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gyunyeop</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sangwoo</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Autoregressive decoding in large language models (LLMs) necessitates a full forward pass for each generated token, significantly increasing inference latency. To address this limitation, we propose Fractal-LLM, a lossless self-speculative decoding method that embeds a compressed model within selected decoder layers of the original model. Specifically, our approach generates multiple draft tokens in parallel by injecting compressed layers into selected decoder layers. These draft tokens are subsequently verified through a single forward pass of the original model, ensuring the final outputs exactly match those produced by the original model. Experimental results across diverse benchmarks—including GSM8K, XSUM, CNN/DailyMail, and HumanEval—demonstrate that our method achieves substantial inference speed-ups (up to 2.47×) compared to standard autoregressive decoding, without requiring any additional training.</abstract>
<identifier type="citekey">kim-etal-2025-fractalllm</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1286/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>23666</start>
<end>23673</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T FractalLLM: Lossless Self-Speculative Decoding with Layer Embedded Self-Compression
%A Kim, Juhyeong
%A Yu, Sangyeon
%A Kim, Gyunyeop
%A Kang, Sangwoo
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F kim-etal-2025-fractalllm
%X Autoregressive decoding in large language models (LLMs) necessitates a full forward pass for each generated token, significantly increasing inference latency. To address this limitation, we propose Fractal-LLM, a lossless self-speculative decoding method that embeds a compressed model within selected decoder layers of the original model. Specifically, our approach generates multiple draft tokens in parallel by injecting compressed layers into selected decoder layers. These draft tokens are subsequently verified through a single forward pass of the original model, ensuring the final outputs exactly match those produced by the original model. Experimental results across diverse benchmarks—including GSM8K, XSUM, CNN/DailyMail, and HumanEval—demonstrate that our method achieves substantial inference speed-ups (up to 2.47×) compared to standard autoregressive decoding, without requiring any additional training.
%U https://aclanthology.org/2025.findings-emnlp.1286/
%P 23666-23673
Markdown (Informal)
[FractalLLM: Lossless Self-Speculative Decoding with Layer Embedded Self-Compression](https://aclanthology.org/2025.findings-emnlp.1286/) (Kim et al., Findings 2025)
ACL
Juhyeong Kim, Sangyeon Yu, Gyunyeop Kim, and Sangwoo Kang. 2025. [FractalLLM: Lossless Self-Speculative Decoding with Layer Embedded Self-Compression](https://aclanthology.org/2025.findings-emnlp.1286/). In *Findings of the Association for Computational Linguistics: EMNLP 2025*, pages 23666–23673, Suzhou, China. Association for Computational Linguistics.
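The abstract above describes a draft-then-verify decoding loop: cheap draft tokens are proposed, then checked against the original model so that the final output is unchanged. The snippet below is a minimal, generic sketch of that lossless (greedy) verification idea, not the paper's FractalLLM implementation: the `draft` and `target` functions are toy stand-ins for the compressed-layer draft pass and the original model's single verification pass described in the abstract.

```python
# Minimal sketch of lossless (greedy) speculative decoding: propose k draft
# tokens, then keep the longest prefix the target model agrees with, plus one
# bonus token from the target. The "models" here are toy functions over
# integer token ids, used only to illustrate the accept/reject logic.

from typing import Callable, List

Model = Callable[[List[int]], int]  # maps a token prefix to the next (greedy) token


def speculative_step(draft: Model, target: Model, prefix: List[int], k: int = 4) -> List[int]:
    """One draft-then-verify step; returns the newly accepted tokens."""
    # 1) Draft phase: the cheap model proposes k tokens autoregressively.
    drafted: List[int] = []
    ctx = list(prefix)
    for _ in range(k):
        t = draft(ctx)
        drafted.append(t)
        ctx.append(t)

    # 2) Verify phase: recompute the target model's greedy choice at each
    #    drafted position. (A real implementation scores all positions in one
    #    batched forward pass; here we call the toy function per position.)
    accepted: List[int] = []
    ctx = list(prefix)
    for t in drafted:
        expected = target(ctx)
        if expected != t:
            # First mismatch: take the target's own token instead and stop.
            accepted.append(expected)
            return accepted
        accepted.append(t)
        ctx.append(t)

    # All drafts accepted: the target's next token comes along for free.
    accepted.append(target(ctx))
    return accepted


if __name__ == "__main__":
    # Toy models: next token = (sum of context) mod 10; the draft model is
    # deliberately wrong at some context lengths so rejections occur.
    target_model: Model = lambda ctx: sum(ctx) % 10
    draft_model: Model = lambda ctx: (sum(ctx) + (1 if len(ctx) % 5 == 0 else 0)) % 10

    out = [3, 1, 4]
    for _ in range(5):
        out += speculative_step(draft_model, target_model, out)
    print(out)  # matches plain greedy decoding with target_model alone
```

Because every accepted token is either a draft token the target would have produced anyway or the target's own token at the first mismatch, the output is token-for-token identical to standard greedy decoding with the target model, which is the "lossless" property the abstract refers to; the speed-up comes from verifying several drafted positions per target forward pass.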