% BibTeX entry for a conference paper in the Findings of ACL 2026 volume
% (record URL points at aclanthology.org).
% NOTE(review): `address` appears to hold the conference venue rather than the
% publisher's city, presumably following the exporter's convention -- confirm
% before changing it.
@inproceedings{zhang-etal-2026-chain,
  title     = {Does Chain-of-Thought Reasoning Help Mobile {GUI} Agents? An Empirical Study},
  author    = {Zhang, Li and
               Gao, Longxi and
               Xu, Mengwei},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.392/},
  pages     = {7981--7996},
  isbn      = {979-8-89176-395-1},
  abstract  = {Reasoning capabilities have significantly improved the performance of vision-language models (VLMs) in domains such as mathematical problem-solving, coding, and visual question-answering. However, their impact on real-world applications remains unclear. This paper presents a large-scale empirical study on the effectiveness of reasoning-enabled VLMs in mobile GUI agents. We evaluate six pairs of VLMs, including both commercial and open-source lightweight models, by comparing their base and reasoning-enhanced versions across static and interactive benchmarks. Our findings show that reasoning-enabled VLMs generally provide only marginal improvements over their non-reasoning counterparts and can even degrade performance in certain agent configurations. Notably, reasoning and non-reasoning VLMs fail on different sets of tasks, suggesting that reasoning does have an impact, but its benefits and drawbacks counterbalance each other. We attribute these inconsistencies to the limitations of benchmarks and VLMs. Based on the findings, we provide insights for further enhancing mobile GUI agents in terms of benchmarks, VLMs, and their adaptability in dynamically invoking reasoning VLMs.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; the ID matches the citekey identifier further down. -->
  <mods ID="zhang-etal-2026-chain">
    <titleInfo>
      <title>Does Chain-of-Thought Reasoning Help Mobile GUI Agents? An Empirical Study</title>
    </titleInfo>
    <!-- Authors of the paper. -->
    <name type="personal">
      <namePart type="given">Li</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Longxi</namePart>
      <namePart type="family">Gao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mengwei</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <!-- Middle initial keeps its period, matching "Viviane P." in the BibTeX and EndNote records. -->
        <namePart type="given">P.</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Reasoning capabilities have significantly improved the performance of vision-language models (VLMs) in domains such as mathematical problem-solving, coding, and visual question-answering. However, their impact on real-world applications remains unclear. This paper presents a large-scale empirical study on the effectiveness of reasoning-enabled VLMs in mobile GUI agents. We evaluate six pairs of VLMs, including both commercial and open-source lightweight models, by comparing their base and reasoning-enhanced versions across static and interactive benchmarks. Our findings show that reasoning-enabled VLMs generally provide only marginal improvements over their non-reasoning counterparts and can even degrade performance in certain agent configurations. Notably, reasoning and non-reasoning VLMs fail on different sets of tasks, suggesting that reasoning does have an impact, but its benefits and drawbacks counterbalance each other. We attribute these inconsistencies to the limitations of benchmarks and VLMs. Based on the findings, we provide insights for further enhancing mobile GUI agents in terms of benchmarks, VLMs, and their adaptability in dynamically invoking reasoning VLMs.</abstract>
    <identifier type="citekey">zhang-etal-2026-chain</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.392/</url>
    </location>
    <!-- Position of the paper inside the host volume: issue date and page range. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>7981</start>
        <end>7996</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Does Chain-of-Thought Reasoning Help Mobile GUI Agents? An Empirical Study
%A Zhang, Li
%A Gao, Longxi
%A Xu, Mengwei
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zhang-etal-2026-chain
%X Reasoning capabilities have significantly improved the performance of vision-language models (VLMs) in domains such as mathematical problem-solving, coding, and visual question-answering. However, their impact on real-world applications remains unclear. This paper presents a large-scale empirical study on the effectiveness of reasoning-enabled VLMs in mobile GUI agents. We evaluate six pairs of VLMs, including both commercial and open-source lightweight models, by comparing their base and reasoning-enhanced versions across static and interactive benchmarks. Our findings show that reasoning-enabled VLMs generally provide only marginal improvements over their non-reasoning counterparts and can even degrade performance in certain agent configurations. Notably, reasoning and non-reasoning VLMs fail on different sets of tasks, suggesting that reasoning does have an impact, but its benefits and drawbacks counterbalance each other. We attribute these inconsistencies to the limitations of benchmarks and VLMs. Based on the findings, we provide insights for further enhancing mobile GUI agents in terms of benchmarks, VLMs, and their adaptability in dynamically invoking reasoning VLMs.
%U https://aclanthology.org/2026.findings-acl.392/
%P 7981-7996
Markdown (Informal)
[Does Chain-of-Thought Reasoning Help Mobile GUI Agents? An Empirical Study](https://aclanthology.org/2026.findings-acl.392/) (Zhang et al., Findings 2026)
ACL