@inproceedings{hoscilowicz-janicki-2025-clickagent,
    title     = {{ClickAgent}: Enhancing {UI} Location Capabilities of Autonomous Agents},
    author    = {Hoscilowicz, Jakub and
      Janicki, Artur},
    editor    = {B{\'e}chet, Fr{\'e}d{\'e}ric and
      Lef{\`e}vre, Fabrice and
      Asher, Nicholas and
      Kim, Seokhwan and
      Merlin, Teva},
    booktitle = {Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue},
    month     = aug,
    year      = {2025},
    address   = {Avignon, France},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.sigdial-1.38/},
    pages     = {471--476},
    abstract  = {With the growing reliance on digital devices with graphical user interfaces (GUIs) like computers and smartphones, the demand for smart voice assistants has grown significantly. While multimodal large language models (MLLM) like GPT-4V excel in many areas, they struggle with GUI interactions, limiting their effectiveness in automating everyday tasks. In this work, we introduce ClickAgent, a novel framework for building autonomous agents. ClickAgent combines MLLM-driven reasoning and action planning with a separate UI location model that identifies relevant UI elements on the screen. This approach addresses a key limitation of current MLLMs: their inability to accurately locate UI elements. Evaluations conducted using both an Android emulator and a real smartphone show that ClickAgent outperforms other autonomous agents (DigiRL, CogAgent, AppAgent) on the AITW benchmark.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hoscilowicz-janicki-2025-clickagent">
<titleInfo>
<title>ClickAgent: Enhancing UI Location Capabilities of Autonomous Agents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jakub</namePart>
<namePart type="family">Hoscilowicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Artur</namePart>
<namePart type="family">Janicki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue</title>
</titleInfo>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabrice</namePart>
<namePart type="family">Lefèvre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicholas</namePart>
<namePart type="family">Asher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokhwan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Teva</namePart>
<namePart type="family">Merlin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Avignon, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With the growing reliance on digital devices with graphical user interfaces (GUIs) like computers and smartphones, the demand for smart voice assistants has grown significantly. While multimodal large language models (MLLM) like GPT-4V excel in many areas, they struggle with GUI interactions, limiting their effectiveness in automating everyday tasks. In this work, we introduce ClickAgent, a novel framework for building autonomous agents. ClickAgent combines MLLM-driven reasoning and action planning with a separate UI location model that identifies relevant UI elements on the screen. This approach addresses a key limitation of current MLLMs: their inability to accurately locate UI elements. Evaluations conducted using both an Android emulator and a real smartphone show that ClickAgent outperforms other autonomous agents (DigiRL, CogAgent, AppAgent) on the AITW benchmark.</abstract>
<identifier type="citekey">hoscilowicz-janicki-2025-clickagent</identifier>
<location>
<url>https://aclanthology.org/2025.sigdial-1.38/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>471</start>
<end>476</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ClickAgent: Enhancing UI Location Capabilities of Autonomous Agents
%A Hoscilowicz, Jakub
%A Janicki, Artur
%Y Béchet, Frédéric
%Y Lefèvre, Fabrice
%Y Asher, Nicholas
%Y Kim, Seokhwan
%Y Merlin, Teva
%S Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue
%D 2025
%8 August
%I Association for Computational Linguistics
%C Avignon, France
%F hoscilowicz-janicki-2025-clickagent
%X With the growing reliance on digital devices with graphical user interfaces (GUIs) like computers and smartphones, the demand for smart voice assistants has grown significantly. While multimodal large language models (MLLM) like GPT-4V excel in many areas, they struggle with GUI interactions, limiting their effectiveness in automating everyday tasks. In this work, we introduce ClickAgent, a novel framework for building autonomous agents. ClickAgent combines MLLM-driven reasoning and action planning with a separate UI location model that identifies relevant UI elements on the screen. This approach addresses a key limitation of current MLLMs: their inability to accurately locate UI elements. Evaluations conducted using both an Android emulator and a real smartphone show that ClickAgent outperforms other autonomous agents (DigiRL, CogAgent, AppAgent) on the AITW benchmark.
%U https://aclanthology.org/2025.sigdial-1.38/
%P 471-476
Markdown (Informal)
[ClickAgent: Enhancing UI Location Capabilities of Autonomous Agents](https://aclanthology.org/2025.sigdial-1.38/) (Hoscilowicz & Janicki, SIGDIAL 2025)
ACL