@inproceedings{gupta-etal-2026-speculative,
title = "Speculative Refinement: A Hybrid Autoregressive Diffusion Decoding Strategy and Its Behavior Across Benchmarks",
author = "Gupta, Aditi and
Mishra, Neel and
Trivedi, Kushagra and
Kumar, Pawan",
editor = "Mille, Simon and
Gehrmann, Sebastian and
Schmidtov{\'a}, Patr{\'i}cia and
Du{\v{s}}ek, Ond{\v{r}}ej and
Fadaee, Marzieh and
Lo, Kyle and
Santus, Enrico and
Stanovsky, Gabriel",
booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.gem-main.33/",
pages = "355--363",
ISBN = "979-8-89176-423-1",
abstract = "How should we evaluate generation systems that combine autoregressive (AR) and diffusion decoding? We study this question through *Speculative Refinement* (SpecRef), a training-free hybrid method that warm-starts a masked diffusion language model from an AR draft using entropy-guided selective masking. Evaluating SpecRef across six benchmarks (HumanEval, MBPP, GSM8K, BBH, ARC-Challenge, HellaSwag) with three distinct evaluation protocols (execution-based pass@1, exact-match, log-likelihood scoring), we surface several findings relevant beyond our specific system: (1) code benchmarks conflate structural discovery with logical correctness: providing a syntactic scaffold lifts accuracy from near zero to over 20{\%} without changing the model, indicating that much of the baseline failure is structural; (2) a *refinement tension* phenomenon where multi-stage correction degrades already-correct tokens, exposing benchmark saturation ceilings invisible to single-model evaluation; (3) log-likelihood and generative evaluation produce different model rankings for the same model pair, suggesting they measure different capabilities; (4) standard Python post-processing silently breaks code evaluation for non-AR generators. These observations apply to any multi-stage or non-autoregressive generation pipeline and point toward more diagnostic evaluation practices."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gupta-etal-2026-speculative">
<titleInfo>
<title>Speculative Refinement: A Hybrid Autoregressive Diffusion Decoding Strategy and Its Behavior Across Benchmarks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aditi</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Neel</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kushagra</namePart>
<namePart type="family">Trivedi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pawan</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrícia</namePart>
<namePart type="family">Schmidtová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Dušek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marzieh</namePart>
<namePart type="family">Fadaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-423-1</identifier>
</relatedItem>
<abstract>How should we evaluate generation systems that combine autoregressive (AR) and diffusion decoding? We study this question through *Speculative Refinement* (SpecRef), a training-free hybrid method that warm-starts a masked diffusion language model from an AR draft using entropy-guided selective masking. Evaluating SpecRef across six benchmarks (HumanEval, MBPP, GSM8K, BBH, ARC-Challenge, HellaSwag) with three distinct evaluation protocols (execution-based pass@1, exact-match, log-likelihood scoring), we surface several findings relevant beyond our specific system: (1) code benchmarks conflate structural discovery with logical correctness: providing a syntactic scaffold lifts accuracy from near zero to over 20% without changing the model, indicating that much of the baseline failure is structural; (2) a *refinement tension* phenomenon where multi-stage correction degrades already-correct tokens, exposing benchmark saturation ceilings invisible to single-model evaluation; (3) log-likelihood and generative evaluation produce different model rankings for the same model pair, suggesting they measure different capabilities; (4) standard Python post-processing silently breaks code evaluation for non-AR generators. These observations apply to any multi-stage or non-autoregressive generation pipeline and point toward more diagnostic evaluation practices.</abstract>
<identifier type="citekey">gupta-etal-2026-speculative</identifier>
<location>
<url>https://aclanthology.org/2026.gem-main.33/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>355</start>
<end>363</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Speculative Refinement: A Hybrid Autoregressive Diffusion Decoding Strategy and Its Behavior Across Benchmarks
%A Gupta, Aditi
%A Mishra, Neel
%A Trivedi, Kushagra
%A Kumar, Pawan
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F gupta-etal-2026-speculative
%X How should we evaluate generation systems that combine autoregressive (AR) and diffusion decoding? We study this question through *Speculative Refinement* (SpecRef), a training-free hybrid method that warm-starts a masked diffusion language model from an AR draft using entropy-guided selective masking. Evaluating SpecRef across six benchmarks (HumanEval, MBPP, GSM8K, BBH, ARC-Challenge, HellaSwag) with three distinct evaluation protocols (execution-based pass@1, exact-match, log-likelihood scoring), we surface several findings relevant beyond our specific system: (1) code benchmarks conflate structural discovery with logical correctness: providing a syntactic scaffold lifts accuracy from near zero to over 20% without changing the model, indicating that much of the baseline failure is structural; (2) a *refinement tension* phenomenon where multi-stage correction degrades already-correct tokens, exposing benchmark saturation ceilings invisible to single-model evaluation; (3) log-likelihood and generative evaluation produce different model rankings for the same model pair, suggesting they measure different capabilities; (4) standard Python post-processing silently breaks code evaluation for non-AR generators. These observations apply to any multi-stage or non-autoregressive generation pipeline and point toward more diagnostic evaluation practices.
%U https://aclanthology.org/2026.gem-main.33/
%P 355-363
Markdown (Informal)
[Speculative Refinement: A Hybrid Autoregressive Diffusion Decoding Strategy and Its Behavior Across Benchmarks](https://aclanthology.org/2026.gem-main.33/) (Gupta et al., GEM 2026)
ACL