@inproceedings{she-etal-2025-token,
title = "Token Level Routing Inference System for Edge Devices",
author = "She, Jianshu and
Zheng, Wenhao and
Liu, Zhengzhong and
Wang, Hongyi and
Xing, Eric P. and
Yao, Huaxiu and
Ho, Qirong",
editor = "Mishra, Pushkar and
Muresan, Smaranda and
Yu, Tao",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-demo.16/",
doi = "10.18653/v1/2025.acl-demo.16",
pages = "159--166",
ISBN = "979-8-89176-253-4",
abstract = "The computational complexity of large language model (LLM) inference significantly constrains their deployment efficiency on edge devices. In contrast, small language models offer faster decoding and lower resource consumption but often suffer from degraded response quality and heightened susceptibility to hallucinations. To address this trade-off, collaborative decoding, in which a large model assists in generating critical tokens, has emerged as a promising solution. This paradigm leverages the strengths of both model types by enabling high-quality inference through selective intervention of the large model, while maintaining the speed and efficiency of the smaller model. In this work, we present a novel collaborative decoding inference system that allows small models to perform on-device inference while selectively consulting a cloud-based large model for critical token generation. Remarkably, the system achieves a 60{\%} performance gain on CommonsenseQA using only a 0.5B model on an M1 MacBook, with under 7{\%} of tokens generation uploaded to the large model in the cloud."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="she-etal-2025-token">
<titleInfo>
<title>Token Level Routing Inference System for Edge Devices</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jianshu</namePart>
<namePart type="family">She</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenhao</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhengzhong</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongyi</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Xing</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huaxiu</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qirong</namePart>
<namePart type="family">Ho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pushkar</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tao</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-253-4</identifier>
</relatedItem>
<abstract>The computational complexity of large language model (LLM) inference significantly constrains their deployment efficiency on edge devices. In contrast, small language models offer faster decoding and lower resource consumption but often suffer from degraded response quality and heightened susceptibility to hallucinations. To address this trade-off, collaborative decoding, in which a large model assists in generating critical tokens, has emerged as a promising solution. This paradigm leverages the strengths of both model types by enabling high-quality inference through selective intervention of the large model, while maintaining the speed and efficiency of the smaller model. In this work, we present a novel collaborative decoding inference system that allows small models to perform on-device inference while selectively consulting a cloud-based large model for critical token generation. Remarkably, the system achieves a 60% performance gain on CommonsenseQA using only a 0.5B model on an M1 MacBook, with under 7% of tokens generation uploaded to the large model in the cloud.</abstract>
<identifier type="citekey">she-etal-2025-token</identifier>
<identifier type="doi">10.18653/v1/2025.acl-demo.16</identifier>
<location>
<url>https://aclanthology.org/2025.acl-demo.16/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>159</start>
<end>166</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Token Level Routing Inference System for Edge Devices
%A She, Jianshu
%A Zheng, Wenhao
%A Liu, Zhengzhong
%A Wang, Hongyi
%A Xing, Eric P.
%A Yao, Huaxiu
%A Ho, Qirong
%Y Mishra, Pushkar
%Y Muresan, Smaranda
%Y Yu, Tao
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-253-4
%F she-etal-2025-token
%X The computational complexity of large language model (LLM) inference significantly constrains their deployment efficiency on edge devices. In contrast, small language models offer faster decoding and lower resource consumption but often suffer from degraded response quality and heightened susceptibility to hallucinations. To address this trade-off, collaborative decoding, in which a large model assists in generating critical tokens, has emerged as a promising solution. This paradigm leverages the strengths of both model types by enabling high-quality inference through selective intervention by the large model, while maintaining the speed and efficiency of the smaller model. In this work, we present a novel collaborative decoding inference system that allows small models to perform on-device inference while selectively consulting a cloud-based large model for critical token generation. Remarkably, the system achieves a 60% performance gain on CommonsenseQA using only a 0.5B model on an M1 MacBook, with under 7% of generated tokens uploaded to the large model in the cloud.
%R 10.18653/v1/2025.acl-demo.16
%U https://aclanthology.org/2025.acl-demo.16/
%U https://doi.org/10.18653/v1/2025.acl-demo.16
%P 159-166
Markdown (Informal)
[Token Level Routing Inference System for Edge Devices](https://aclanthology.org/2025.acl-demo.16/) (She et al., ACL 2025)

ACL
Jianshu She, Wenhao Zheng, Zhengzhong Liu, Hongyi Wang, Eric P. Xing, Huaxiu Yao, and Qirong Ho. 2025. Token Level Routing Inference System for Edge Devices. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations), pages 159–166, Vienna, Austria. Association for Computational Linguistics.
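Illustrative sketch (Informal)
A minimal, self-contained sketch of the confidence-threshold token routing idea the abstract describes: the small model decodes on-device and escalates only low-confidence ("critical") steps to the large model. The stub models, toy vocabulary, and the `confidence_threshold` value are illustrative assumptions, not the authors' released implementation.

```python
import random
from typing import List, Tuple

# Stand-ins for an on-device small model and a cloud-hosted large model.
# A real system would return logits from actual LLMs; here we fake a
# next-token distribution so the routing logic can be run end to end.
VOCAB = ["the", "cat", "sat", "on", "mat", "."]


def small_model_next_token(context: List[str]) -> Tuple[str, float]:
    """Return the small model's top token and its (normalized) confidence."""
    probs = [random.random() for _ in VOCAB]
    total = sum(probs)
    probs = [p / total for p in probs]
    best = max(range(len(VOCAB)), key=lambda i: probs[i])
    return VOCAB[best], probs[best]


def large_model_next_token(context: List[str]) -> str:
    """Stand-in for a round trip to the cloud-based large model."""
    return random.choice(VOCAB)


def route_decode(prompt: List[str], max_new_tokens: int = 20,
                 confidence_threshold: float = 0.3) -> Tuple[List[str], float]:
    """Decode token by token, escalating low-confidence steps to the large model.

    The threshold trades response quality against the fraction of tokens
    uploaded to the cloud; its value here is purely for demonstration.
    """
    context = list(prompt)
    routed = 0
    for _ in range(max_new_tokens):
        token, confidence = small_model_next_token(context)
        if confidence < confidence_threshold:
            # Critical token: the small model is unsure, so consult the large model.
            token = large_model_next_token(context)
            routed += 1
        context.append(token)
        if token == ".":
            break
    upload_ratio = routed / max(1, len(context) - len(prompt))
    return context, upload_ratio


if __name__ == "__main__":
    random.seed(0)
    text, ratio = route_decode(["the"])
    print(" ".join(text))
    print(f"fraction of tokens routed to the large model: {ratio:.0%}")
```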