% ACL Anthology record for the EVA paper (Findings of ACL 2026).
% Text outside an entry is ignored by BibTeX, so these lines act as comments.
% Case-sensitive words in title/booktitle are brace-protected as whole words.
@inproceedings{lu-etal-2026-eva,
  title     = {{EVA}: Evolving Semantic Adversaries for Red-Teaming {GUI} Agents Against Environmental Injection Attacks},
  author    = {Lu, Yijie and
               Zhao, Manman and
               Ju, Tianjie and
               Yan, Zihe and
               Ma, Xinbei and
               Guo, Yuan and
               Ding, Daizong and
               Liu, Gongshen and
               Zhang, Zhuosheng},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1230/},
  pages     = {24597--24615},
  isbn      = {979-8-89176-395-1},
  abstract  = {Autonomous GUI agents are inherently vulnerable to Environmental Injection Attacks (EIAs). However, existing red-teaming methods face a trade-off between requiring target-specific knowledge and incurring prohibitive computational costs. More fundamentally, a key question remains: \emph{what factors determine attack success?} To answer this, we first analyze two dimensions: visual appearance (e.g., position, size, color) and semantic content. We find that semantic content dominates, while visual variations have negligible impact. Leveraging this insight, we introduce EVA, a framework that evolves payloads exclusively on the semantic dimension via a discovery-deployment pipeline. Experiments demonstrate that EVA significantly outperforms baselines, achieving 59{\%} to 85{\%} average Attack Success Rate (ASR) while evolving benign seeds into successful attacks within 1.18 to 1.71 iterations. This rapid convergence suggests a dense semantic attack space within the model{'}s latent space. Whenever an input falls into this space, the agent becomes inherently vulnerable, exposing a fundamental alignment flaw in current multimodal representations.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 description of one conference paper; the record ID doubles as the citation key. -->
  <mods ID="lu-etal-2026-eva">
    <titleInfo>
      <title>EVA: Evolving Semantic Adversaries for Red-Teaming GUI Agents Against Environmental Injection Attacks</title>
    </titleInfo>

    <!-- Authors, one <name> element each, in byline order. -->
    <name type="personal">
      <namePart type="given">Yijie</namePart>
      <namePart type="family">Lu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Manman</namePart>
      <namePart type="family">Zhao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tianjie</namePart>
      <namePart type="family">Ju</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zihe</namePart>
      <namePart type="family">Yan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xinbei</namePart>
      <namePart type="family">Ma</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuan</namePart>
      <namePart type="family">Guo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Daizong</namePart>
      <namePart type="family">Ding</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gongshen</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhuosheng</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume containing the paper, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>

    <abstract>Autonomous GUI agents are inherently vulnerable to Environmental Injection Attacks (EIAs). However, existing red-teaming methods face a trade-off between requiring target-specific knowledge and incurring prohibitive computational costs. More fundamentally, a key question remains: what factors determine attack success? To answer this, we first analyze two dimensions: visual appearance (e.g., position, size, color) and semantic content. We find that semantic content dominates, while visual variations have negligible impact. Leveraging this insight, we introduce EVA, a framework that evolves payloads exclusively on the semantic dimension via a discovery-deployment pipeline. Experiments demonstrate that EVA significantly outperforms baselines, achieving 59% to 85% average Attack Success Rate (ASR) while evolving benign seeds into successful attacks within 1.18 to 1.71 iterations. This rapid convergence suggests a dense semantic attack space within the model’s latent space. Whenever an input falls into this space, the agent becomes inherently vulnerable, exposing a fundamental alignment flaw in current multimodal representations.</abstract>
    <identifier type="citekey">lu-etal-2026-eva</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1230/</url>
    </location>

    <!-- Position of the paper inside the host volume: issue date and page range. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>24597</start>
        <end>24615</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T EVA: Evolving Semantic Adversaries for Red-Teaming GUI Agents Against Environmental Injection Attacks
%A Lu, Yijie
%A Zhao, Manman
%A Ju, Tianjie
%A Yan, Zihe
%A Ma, Xinbei
%A Guo, Yuan
%A Ding, Daizong
%A Liu, Gongshen
%A Zhang, Zhuosheng
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F lu-etal-2026-eva
%X Autonomous GUI agents are inherently vulnerable to Environmental Injection Attacks (EIAs). However, existing red-teaming methods face a trade-off between requiring target-specific knowledge and incurring prohibitive computational costs. More fundamentally, a key question remains: what factors determine attack success? To answer this, we first analyze two dimensions: visual appearance (e.g., position, size, color) and semantic content. We find that semantic content dominates, while visual variations have negligible impact. Leveraging this insight, we introduce EVA, a framework that evolves payloads exclusively on the semantic dimension via a discovery-deployment pipeline. Experiments demonstrate that EVA significantly outperforms baselines, achieving 59% to 85% average Attack Success Rate (ASR) while evolving benign seeds into successful attacks within 1.18 to 1.71 iterations. This rapid convergence suggests a dense semantic attack space within the model’s latent space. Whenever an input falls into this space, the agent becomes inherently vulnerable, exposing a fundamental alignment flaw in current multimodal representations.
%U https://aclanthology.org/2026.findings-acl.1230/
%P 24597-24615
Markdown (Informal)
[EVA: Evolving Semantic Adversaries for Red-Teaming GUI Agents Against Environmental Injection Attacks](https://aclanthology.org/2026.findings-acl.1230/) (Lu et al., Findings 2026)
ACL
- Yijie Lu, Manman Zhao, Tianjie Ju, Zihe Yan, Xinbei Ma, Yuan Guo, Daizong Ding, Gongshen Liu, and Zhuosheng Zhang. 2026. EVA: Evolving Semantic Adversaries for Red-Teaming GUI Agents Against Environmental Injection Attacks. In Findings of the Association for Computational Linguistics: ACL 2026, pages 24597–24615, San Diego, California, United States. Association for Computational Linguistics.