@inproceedings{hsiung-etal-2026-pgga,
title = "{PGGA}: A Plan-Grounded {GUI} Agent for Automated Device Support",
author = "Hsiung, Lei and
Chen, Zhiyu and
Kim, Seonhoon and
Liu, Qun",
editor = "Yan, Qianqi and
Montariol, Syrielle and
Fan, Yue and
Gu, Jing and
Pan, Jiayi and
Li, Manling and
Kordjamshidi, Parisa and
Suhr, Alane and
Wang, Xin Eric",
booktitle = "Proceedings of the 4th Workshop on Advances in Language and Vision Research ({ALVR})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.alvr-main.9/",
pages = "105--114",
ISBN = "979-8-89176-398-2",
abstract = "Current GUI agents struggle with multi-step digital device support. We investigate whether this failure is partly caused by a procedural knowledge deficit: agents often rely on zero-shot visual exploration instead of executing verified instructions. To address this, we introduce the Plan-Grounded GUI Agent (PGGA), framing interface navigation as a knowledge-execution problem by conditioning low-level actions on step-by-step text plans. Evaluated on our focused Device-Support Interaction Benchmark (DSIB), results reveal a sharp gap between knowing which operation to perform and grounding that operation on the screen: GTA1-7B reaches 99.59{\%} Operation Accuracy with expert plans, but only 82.99{\%} Element Accuracy and 45.61{\%} Task Success Rate; without plans, its Task Success Rate is 0.00{\%}. Our fine-tuned 2B-parameter PGGA achieves 54.39{\%} Task Success Rate and 91.28{\%} Element Accuracy when guided by expert plans, suggesting that explicit procedural grounding can substantially improve GUI execution when high-quality plans are available. Project Page: https://hsiung.cc/PGGA/"
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hsiung-etal-2026-pgga">
<titleInfo>
<title>PGGA: A Plan-Grounded GUI Agent for Automated Device Support</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lei</namePart>
<namePart type="family">Hsiung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seonhoon</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qun</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Advances in Language and Vision Research (ALVR)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Qianqi</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Syrielle</namePart>
<namePart type="family">Montariol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiayi</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manling</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Parisa</namePart>
<namePart type="family">Kordjamshidi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alane</namePart>
<namePart type="family">Suhr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xin</namePart>
<namePart type="given">Eric</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-398-2</identifier>
</relatedItem>
<abstract>Current GUI agents struggle with multi-step digital device support. We investigate whether this failure is partly caused by a procedural knowledge deficit: agents often rely on zero-shot visual exploration instead of executing verified instructions. To address this, we introduce the Plan-Grounded GUI Agent (PGGA), framing interface navigation as a knowledge-execution problem by conditioning low-level actions on step-by-step text plans. Evaluated on our focused Device-Support Interaction Benchmark (DSIB), results reveal a sharp gap between knowing which operation to perform and grounding that operation on the screen: GTA1-7B reaches 99.59% Operation Accuracy with expert plans, but only 82.99% Element Accuracy and 45.61% Task Success Rate; without plans, its Task Success Rate is 0.00%. Our fine-tuned 2B-parameter PGGA achieves 54.39% Task Success Rate and 91.28% Element Accuracy when guided by expert plans, suggesting that explicit procedural grounding can substantially improve GUI execution when high-quality plans are available. Project Page: https://hsiung.cc/PGGA/</abstract>
<identifier type="citekey">hsiung-etal-2026-pgga</identifier>
<location>
<url>https://aclanthology.org/2026.alvr-main.9/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>105</start>
<end>114</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PGGA: A Plan-Grounded GUI Agent for Automated Device Support
%A Hsiung, Lei
%A Chen, Zhiyu
%A Kim, Seonhoon
%A Liu, Qun
%Y Yan, Qianqi
%Y Montariol, Syrielle
%Y Fan, Yue
%Y Gu, Jing
%Y Pan, Jiayi
%Y Li, Manling
%Y Kordjamshidi, Parisa
%Y Suhr, Alane
%Y Wang, Xin Eric
%S Proceedings of the 4th Workshop on Advances in Language and Vision Research (ALVR)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-398-2
%F hsiung-etal-2026-pgga
%X Current GUI agents struggle with multi-step digital device support. We investigate whether this failure is partly caused by a procedural knowledge deficit: agents often rely on zero-shot visual exploration instead of executing verified instructions. To address this, we introduce the Plan-Grounded GUI Agent (PGGA), framing interface navigation as a knowledge-execution problem by conditioning low-level actions on step-by-step text plans. Evaluated on our focused Device-Support Interaction Benchmark (DSIB), results reveal a sharp gap between knowing which operation to perform and grounding that operation on the screen: GTA1-7B reaches 99.59% Operation Accuracy with expert plans, but only 82.99% Element Accuracy and 45.61% Task Success Rate; without plans, its Task Success Rate is 0.00%. Our fine-tuned 2B-parameter PGGA achieves 54.39% Task Success Rate and 91.28% Element Accuracy when guided by expert plans, suggesting that explicit procedural grounding can substantially improve GUI execution when high-quality plans are available. Project Page: https://hsiung.cc/PGGA/
%U https://aclanthology.org/2026.alvr-main.9/
%P 105-114
Markdown (Informal)
[PGGA: A Plan-Grounded GUI Agent for Automated Device Support](https://aclanthology.org/2026.alvr-main.9/) (Hsiung et al., ALVR 2026)
ACL