@inproceedings{chiang-etal-2026-shanks,
title = "Shanks: Simultaneous Hearing and Thinking for Spoken Language Models",
author = "Chiang, Cheng-Han and
Wang, Xiaofei and
Li, Linjie and
Lin, Chung-Ching and
Lin, Kevin and
Liu, Shujie and
Wang, Zhendong and
Yang, Zhengyuan and
Lee, Hung-yi and
Wang, Lijuan",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.404/",
pages = "8951--8972",
ISBN = "979-8-89176-390-6",
abstract = "Current large language models (LLMs) and spoken language models (SLMs) begin thinking and taking actions only after the user has finished their turn. This prevents the model from interacting with the user during the user{'}s turn and can lead to high response latency when the model is thinking. To address this issue, we draw inspiration from the ``think while listening'' behavior of humans. In this paper, we propose SHANKS, a general inference framework that enables SLMs to generate unspoken chain-of-thought reasoning while listening to user input. SHANKS streams input speech in fixed-duration chunks and, as soon as a chunk is received, generates unspoken reasoning based on all previous speech and reasoning, while the user continues speaking. SHANKS uses unspoken reasoning to determine whether to interrupt the user and make tool calls to complete the task. We demonstrate that SHANKS enhances real-time user{--}SLM interaction in two scenarios: (1) SHANKS can listen to the user{'}s speech and interrupt when the user makes a mistake. (2) In a tool-augmented dialogue scenario, SHANKS can complete 56.9{\%} of the tool calls before the user ends their turn. Overall, SHANKS is a step toward models that keep thinking throughout the conversation, not only after a turn ends. Demos can be found on the project page: https://d223302.github.io/SHANKS/."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chiang-etal-2026-shanks">
<titleInfo>
<title>Shanks: Simultaneous Hearing and Thinking for Spoken Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cheng-Han</namePart>
<namePart type="family">Chiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaofei</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Linjie</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chung-Ching</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shujie</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhendong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhengyuan</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hung-yi</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lijuan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Current large language models (LLMs) and spoken language models (SLMs) begin thinking and taking actions only after the user has finished their turn. This prevents the model from interacting with the user during the user’s turn and can lead to high response latency when the model is thinking. To address this issue, we draw inspiration from the “think while listening” behavior of humans. In this paper, we propose SHANKS, a general inference framework that enables SLMs to generate unspoken chain-of-thought reasoning while listening to user input. SHANKS streams input speech in fixed-duration chunks and, as soon as a chunk is received, generates unspoken reasoning based on all previous speech and reasoning, while the user continues speaking. SHANKS uses unspoken reasoning to determine whether to interrupt the user and make tool calls to complete the task. We demonstrate that SHANKS enhances real-time user–SLM interaction in two scenarios: (1) SHANKS can listen to the user’s speech and interrupt when the user makes a mistake. (2) In a tool-augmented dialogue scenario, SHANKS can complete 56.9% of the tool calls before the user ends their turn. Overall, SHANKS is a step toward models that keep thinking throughout the conversation, not only after a turn ends. Demos can be found on the project page: https://d223302.github.io/SHANKS/.</abstract>
<identifier type="citekey">chiang-etal-2026-shanks</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.404/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>8951</start>
<end>8972</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Shanks: Simultaneous Hearing and Thinking for Spoken Language Models
%A Chiang, Cheng-Han
%A Wang, Xiaofei
%A Li, Linjie
%A Lin, Chung-Ching
%A Lin, Kevin
%A Liu, Shujie
%A Wang, Zhendong
%A Yang, Zhengyuan
%A Lee, Hung-yi
%A Wang, Lijuan
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F chiang-etal-2026-shanks
%X Current large language models (LLMs) and spoken language models (SLMs) begin thinking and taking actions only after the user has finished their turn. This prevents the model from interacting with the user during the user’s turn and can lead to high response latency when the model is thinking. To address this issue, we draw inspiration from the “think while listening” behavior of humans. In this paper, we propose SHANKS, a general inference framework that enables SLMs to generate unspoken chain-of-thought reasoning while listening to user input. SHANKS streams input speech in fixed-duration chunks and, as soon as a chunk is received, generates unspoken reasoning based on all previous speech and reasoning, while the user continues speaking. SHANKS uses unspoken reasoning to determine whether to interrupt the user and make tool calls to complete the task. We demonstrate that SHANKS enhances real-time user–SLM interaction in two scenarios: (1) SHANKS can listen to the user’s speech and interrupt when the user makes a mistake. (2) In a tool-augmented dialogue scenario, SHANKS can complete 56.9% of the tool calls before the user ends their turn. Overall, SHANKS is a step toward models that keep thinking throughout the conversation, not only after a turn ends. Demos can be found on the project page: https://d223302.github.io/SHANKS/.
%U https://aclanthology.org/2026.acl-long.404/
%P 8951-8972
Markdown (Informal)
[Shanks: Simultaneous Hearing and Thinking for Spoken Language Models](https://aclanthology.org/2026.acl-long.404/) (Chiang et al., ACL 2026)
ACL
- Cheng-Han Chiang, Xiaofei Wang, Linjie Li, Chung-Ching Lin, Kevin Lin, Shujie Liu, Zhendong Wang, Zhengyuan Yang, Hung-yi Lee, and Lijuan Wang. 2026. Shanks: Simultaneous Hearing and Thinking for Spoken Language Models. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 8951–8972, San Diego, California, United States. Association for Computational Linguistics.