@inproceedings{kingetsu-etal-2026-look,
title = "Look Before You Leap: A Lookahead Reasoning Quality Gate for Speculative Decoding",
author = "Kingetsu, Hiroaki and
Yokoo, Kaoru and
Fukumizu, Kenji and
Kaul, Manohar",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-long.367/",
pages = "7831--7847",
ISBN = "979-8-89176-380-7",
abstract = "We present a lookahead quality gate (verifier) for speculative decoding for reasoning or chain-of-thought language models. The gate accepts the longest reliable prefix of each k-token lookahead (block-wise) draft. Unlike token-level likelihood search, which is myopic and often rewards verbosity, or tree-level sampling methods that trade accuracy for latency, our approach works at an intermediate granularity. It uses only the base model{'}s hidden states to compute a geometry-based quality score for each prefix, then accepts the longest prefix whose score exceeds a quantile-calibrated threshold estimated from unlabeled prompts. The method integrates seamlessly with speculative/blockwise decoding and adds minimal runtime overhead, requiring no auxiliary heads, reward models, or finetuning. On math and science benchmarks, it improves accuracy over sampling baselines while achieving $2.6-7.9×$ faster generation."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kingetsu-etal-2026-look">
<titleInfo>
<title>Look Before You Leap: A Lookahead Reasoning Quality Gate for Speculative Decoding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hiroaki</namePart>
<namePart type="family">Kingetsu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaoru</namePart>
<namePart type="family">Yokoo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kenji</namePart>
<namePart type="family">Fukumizu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manohar</namePart>
<namePart type="family">Kaul</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>We present a lookahead quality gate (verifier) for speculative decoding for reasoning or chain-of-thought language models. The gate accepts the longest reliable prefix of each k-token lookahead (block-wise) draft. Unlike token-level likelihood search, which is myopic and often rewards verbosity, or tree-level sampling methods that trade accuracy for latency, our approach works at an intermediate granularity. It uses only the base model’s hidden states to compute a geometry-based quality score for each prefix, then accepts the longest prefix whose score exceeds a quantile-calibrated threshold estimated from unlabeled prompts. The method integrates seamlessly with speculative/blockwise decoding and adds minimal runtime overhead, requiring no auxiliary heads, reward models, or finetuning. On math and science benchmarks, it improves accuracy over sampling baselines while achieving 2.6-7.9× faster generation.</abstract>
<identifier type="citekey">kingetsu-etal-2026-look</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.367/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>7831</start>
<end>7847</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Look Before You Leap: A Lookahead Reasoning Quality Gate for Speculative Decoding
%A Kingetsu, Hiroaki
%A Yokoo, Kaoru
%A Fukumizu, Kenji
%A Kaul, Manohar
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F kingetsu-etal-2026-look
%X We present a lookahead quality gate (verifier) for speculative decoding for reasoning or chain-of-thought language models. The gate accepts the longest reliable prefix of each k-token lookahead (block-wise) draft. Unlike token-level likelihood search, which is myopic and often rewards verbosity, or tree-level sampling methods that trade accuracy for latency, our approach works at an intermediate granularity. It uses only the base model’s hidden states to compute a geometry-based quality score for each prefix, then accepts the longest prefix whose score exceeds a quantile-calibrated threshold estimated from unlabeled prompts. The method integrates seamlessly with speculative/blockwise decoding and adds minimal runtime overhead, requiring no auxiliary heads, reward models, or finetuning. On math and science benchmarks, it improves accuracy over sampling baselines while achieving 2.6-7.9× faster generation.
%U https://aclanthology.org/2026.eacl-long.367/
%P 7831-7847
Markdown (Informal)
[Look Before You Leap: A Lookahead Reasoning Quality Gate for Speculative Decoding](https://aclanthology.org/2026.eacl-long.367/) (Kingetsu et al., EACL 2026)
ACL