@inproceedings{chen-etal-2025-pre3,
title = "Pre$^3$: Enabling Deterministic Pushdown Automata for Faster Structured {LLM} Generation",
author = "Chen, Junyi and
Bai, Shihao and
Wang, Zaijun and
Wu, Siyu and
Du, Chuheng and
Yang, Hailong and
Gong, Ruihao and
Liu, Shengzhong and
Wu, Fan and
Chen, Guihai",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.551/",
doi = "10.18653/v1/2025.acl-long.551",
pages = "11253--11267",
ISBN = "979-8-89176-251-0",
abstract = "Extensive LLM applications demand efficient structured generations, particularly for LR(1) grammars, to produce outputs in specified formats (e.g., JSON). Existing methods primarily parse LR(1) grammars into a pushdown automaton (PDA), leading to runtime execution overhead for context-dependent token processing, especially inefficient under large inference batches.To address these issues, we propose $\text{Pre}^3$ that exploits deterministic pushdown automata (DPDA) to optimize the constrained LLM decoding efficiency.First, by **pre**computing **pre**fix-conditioned edges during the **pre**processing, $\text{Pre}^3$ enables ahead-of-time edge analysis and thus makes parallel transition processing possible.Futher, leveraging the prefix-conditioned edges, $\text{Pre}^3$ introduces a novel approach that transforms LR(1) transition graphs into DPDA, eliminating the need for runtime path exploration and achieving edge transitions with minimal overhead.$\text{Pre}^3$ can be seamlessly integrated into standard LLM inference frameworks, improving time per output token (TPOT) by up to 40{\%} and throughput by up to 36{\%} in our experiments. Our code is available at https://github.com/ModelTC/lightllm."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2025-pre3">
<titleInfo>
<title>Pre³: Enabling Deterministic Pushdown Automata for Faster Structured LLM Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Junyi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shihao</namePart>
<namePart type="family">Bai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zaijun</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siyu</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chuheng</namePart>
<namePart type="family">Du</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hailong</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruihao</namePart>
<namePart type="family">Gong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shengzhong</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fan</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guihai</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Extensive LLM applications demand efficient structured generation, particularly for LR(1) grammars, to produce outputs in specified formats (e.g., JSON). Existing methods primarily parse LR(1) grammars into a pushdown automaton (PDA), leading to runtime execution overhead for context-dependent token processing, which is especially inefficient under large inference batches. To address these issues, we propose Pre³, which exploits deterministic pushdown automata (DPDA) to optimize constrained LLM decoding efficiency. First, by precomputing prefix-conditioned edges during preprocessing, Pre³ enables ahead-of-time edge analysis and thus makes parallel transition processing possible. Further, leveraging the prefix-conditioned edges, Pre³ introduces a novel approach that transforms LR(1) transition graphs into a DPDA, eliminating the need for runtime path exploration and achieving edge transitions with minimal overhead. Pre³ can be seamlessly integrated into standard LLM inference frameworks, improving time per output token (TPOT) by up to 40% and throughput by up to 36% in our experiments. Our code is available at https://github.com/ModelTC/lightllm.</abstract>
<identifier type="citekey">chen-etal-2025-pre3</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.551</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.551/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>11253</start>
<end>11267</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Pre³: Enabling Deterministic Pushdown Automata for Faster Structured LLM Generation
%A Chen, Junyi
%A Bai, Shihao
%A Wang, Zaijun
%A Wu, Siyu
%A Du, Chuheng
%A Yang, Hailong
%A Gong, Ruihao
%A Liu, Shengzhong
%A Wu, Fan
%A Chen, Guihai
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F chen-etal-2025-pre3
%X Extensive LLM applications demand efficient structured generation, particularly for LR(1) grammars, to produce outputs in specified formats (e.g., JSON). Existing methods primarily parse LR(1) grammars into a pushdown automaton (PDA), leading to runtime execution overhead for context-dependent token processing, which is especially inefficient under large inference batches. To address these issues, we propose Pre³, which exploits deterministic pushdown automata (DPDA) to optimize constrained LLM decoding efficiency. First, by precomputing prefix-conditioned edges during preprocessing, Pre³ enables ahead-of-time edge analysis and thus makes parallel transition processing possible. Further, leveraging the prefix-conditioned edges, Pre³ introduces a novel approach that transforms LR(1) transition graphs into a DPDA, eliminating the need for runtime path exploration and achieving edge transitions with minimal overhead. Pre³ can be seamlessly integrated into standard LLM inference frameworks, improving time per output token (TPOT) by up to 40% and throughput by up to 36% in our experiments. Our code is available at https://github.com/ModelTC/lightllm.
%R 10.18653/v1/2025.acl-long.551
%U https://aclanthology.org/2025.acl-long.551/
%U https://doi.org/10.18653/v1/2025.acl-long.551
%P 11253-11267
Markdown (Informal)

[Pre³: Enabling Deterministic Pushdown Automata for Faster Structured LLM Generation](https://aclanthology.org/2025.acl-long.551/) (Chen et al., ACL 2025)

ACL

- Junyi Chen, Shihao Bai, Zaijun Wang, Siyu Wu, Chuheng Du, Hailong Yang, Ruihao Gong, Shengzhong Liu, Fan Wu, and Guihai Chen. 2025. Pre³: Enabling Deterministic Pushdown Automata for Faster Structured LLM Generation. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 11253–11267, Vienna, Austria. Association for Computational Linguistics.
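The abstract describes precomputing prefix-conditioned edges and converting LR(1) transition graphs into a deterministic pushdown automaton (DPDA) so that constrained decoding needs no runtime path exploration. The sketch below is not the paper's implementation (see the linked lightllm repository for that); it is a toy DPDA over an invented bracket vocabulary, included only to illustrate why determinism lets a decoder read the allowed-next-token mask directly from its current state and stack top.

```python
# Illustrative sketch only: a toy deterministic pushdown automaton (DPDA)
# over a tiny bracket vocabulary. All names and the grammar are invented;
# this is not the Pre^3 code. Because every (state, token, top-of-stack)
# key maps to exactly one transition, the set of legal next tokens follows
# directly from the current configuration, with no path exploration.

VOCAB = ["{", "}", "[", "]", "x", "<eos>"]

# (state, token, top_of_stack) -> (next_state, stack_action)
# stack_action: ("push", sym) | ("pop",) | ("noop",)
TRANSITIONS = {
    ("s", "{", None): ("s", ("push", "{")),
    ("s", "{", "{"): ("s", ("push", "{")),
    ("s", "{", "["): ("s", ("push", "{")),
    ("s", "[", None): ("s", ("push", "[")),
    ("s", "[", "{"): ("s", ("push", "[")),
    ("s", "[", "["): ("s", ("push", "[")),
    ("s", "}", "{"): ("s", ("pop",)),
    ("s", "]", "["): ("s", ("pop",)),
    ("s", "x", "{"): ("s", ("noop",)),
    ("s", "x", "["): ("s", ("noop",)),
    ("s", "<eos>", None): ("accept", ("noop",)),
}

def allowed_tokens(state, stack):
    """Token mask for the current configuration: determinism means the
    (state, top-of-stack) pair alone decides which tokens may follow."""
    top = stack[-1] if stack else None
    return [t for t in VOCAB if (state, t, top) in TRANSITIONS]

def step(state, stack, token):
    """Consume one token and return the new (state, stack)."""
    top = stack[-1] if stack else None
    next_state, action = TRANSITIONS[(state, token, top)]
    if action[0] == "push":
        stack = stack + [action[1]]
    elif action[0] == "pop":
        stack = stack[:-1]
    return next_state, stack

if __name__ == "__main__":
    state, stack = "s", []
    for tok in ["{", "[", "x", "]", "}", "<eos>"]:
        assert tok in allowed_tokens(state, stack), f"{tok} not allowed"
        state, stack = step(state, stack, tok)
    print("accepted:", state == "accept")
```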