@inproceedings{sravanthi-etal-2025-perception,
title = "From Perception to Reasoning: Enhancing Vision-Language Models for Mobile {UI} Understanding",
author = "Sravanthi, Settaluri Lakshmi and
Mishra, Ankit and
Mondal, Debjyoti and
Panda, Subhadarshi and
Singh, Rituraj and
Bhattacharyya, Pushpak",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1295/",
doi = "10.18653/v1/2025.findings-acl.1295",
pages = "25250--25269",
ISBN = "979-8-89176-256-5",
abstract = "Accurately grounding visual and textual elements within mobile user interfaces (UIs) remains a significant challenge for Vision-Language Models (VLMs). Visual grounding, a critical task in this domain, involves identifying the most relevant UI element or region based on a natural language query{---}a process that requires both precise perception and context-aware reasoning. In this work, we present - **MoUI**, a light-weight mobile UI understanding model trained on **MoIT**, an instruction-tuning dataset specifically tailored for mobile screen understanding and grounding, designed to bridge the gap between user intent and visual semantics. Complementing this dataset, we also present a human-annotated reasoning benchmark **MoIQ** that rigorously evaluates complex inference capabilities over mobile UIs. To harness these resources effectively, we propose a two-stage training approach that separately addresses perception and reasoning tasks, leading to stronger perception capabilities and improvement in reasoning abilities. Through extensive experiments, we demonstrate that our MoUI models achieve significant gains in accuracy across all perception tasks and {\_}state-of-the-art{\_} results on public reasoning benchmark **ComplexQA (78{\%}) and our MoIQ (49{\%})**. We will be open-sourcing our dataset, code, and models to foster further research and innovation in the field."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sravanthi-etal-2025-perception">
<titleInfo>
<title>From Perception to Reasoning: Enhancing Vision-Language Models for Mobile UI Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Settaluri</namePart>
<namePart type="given">Lakshmi</namePart>
<namePart type="family">Sravanthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ankit</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debjyoti</namePart>
<namePart type="family">Mondal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Subhadarshi</namePart>
<namePart type="family">Panda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rituraj</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Accurately grounding visual and textual elements within mobile user interfaces (UIs) remains a significant challenge for Vision-Language Models (VLMs). Visual grounding, a critical task in this domain, involves identifying the most relevant UI element or region based on a natural language query—a process that requires both precise perception and context-aware reasoning. In this work, we present MoUI, a lightweight mobile UI understanding model trained on MoIT, an instruction-tuning dataset specifically tailored for mobile screen understanding and grounding, designed to bridge the gap between user intent and visual semantics. Complementing this dataset, we also present MoIQ, a human-annotated reasoning benchmark that rigorously evaluates complex inference capabilities over mobile UIs. To harness these resources effectively, we propose a two-stage training approach that addresses perception and reasoning tasks separately, leading to stronger perception capabilities and improved reasoning abilities. Through extensive experiments, we demonstrate that our MoUI models achieve significant gains in accuracy across all perception tasks and state-of-the-art results on the public reasoning benchmark ComplexQA (78%) and on our MoIQ (49%). We will be open-sourcing our dataset, code, and models to foster further research and innovation in the field.</abstract>
<identifier type="citekey">sravanthi-etal-2025-perception</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1295</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1295/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>25250</start>
<end>25269</end>
</extent>
</part>
</mods>
</modsCollection>
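The MODS record above is plain XML in the http://www.loc.gov/mods/v3 namespace, so it can be read with standard tooling. As a minimal sketch using only Python's standard library, one way to pull the title, author names, DOI, and page range out of the record is shown below; the function name `summarize_mods` and the `mods_xml` argument are placeholders of ours, not part of the Anthology export.

```python
import xml.etree.ElementTree as ET

# Namespace used by the <modsCollection> document above.
MODS_NS = {"m": "http://www.loc.gov/mods/v3"}

def summarize_mods(mods_xml: str) -> dict:
    """Extract title, authors, DOI, and page range from the MODS record."""
    root = ET.fromstring(mods_xml)          # <modsCollection>
    record = root.find("m:mods", MODS_NS)   # the single <mods> record

    title = record.findtext("m:titleInfo/m:title", namespaces=MODS_NS)

    # Only top-level <name> elements are authors; editors sit under <relatedItem>.
    authors = []
    for name in record.findall("m:name", MODS_NS):
        role = name.findtext("m:role/m:roleTerm", namespaces=MODS_NS)
        if role == "author":
            parts = [p.text for p in name.findall("m:namePart", MODS_NS)]
            authors.append(" ".join(parts))

    doi = None
    for ident in record.findall("m:identifier", MODS_NS):
        if ident.get("type") == "doi":
            doi = ident.text

    pages = (
        record.findtext("m:part/m:extent/m:start", namespaces=MODS_NS),
        record.findtext("m:part/m:extent/m:end", namespaces=MODS_NS),
    )
    return {"title": title, "authors": authors, "doi": doi, "pages": pages}
```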
%0 Conference Proceedings
%T From Perception to Reasoning: Enhancing Vision-Language Models for Mobile UI Understanding
%A Sravanthi, Settaluri Lakshmi
%A Mishra, Ankit
%A Mondal, Debjyoti
%A Panda, Subhadarshi
%A Singh, Rituraj
%A Bhattacharyya, Pushpak
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F sravanthi-etal-2025-perception
%X Accurately grounding visual and textual elements within mobile user interfaces (UIs) remains a significant challenge for Vision-Language Models (VLMs). Visual grounding, a critical task in this domain, involves identifying the most relevant UI element or region based on a natural language query—a process that requires both precise perception and context-aware reasoning. In this work, we present MoUI, a lightweight mobile UI understanding model trained on MoIT, an instruction-tuning dataset specifically tailored for mobile screen understanding and grounding, designed to bridge the gap between user intent and visual semantics. Complementing this dataset, we also present MoIQ, a human-annotated reasoning benchmark that rigorously evaluates complex inference capabilities over mobile UIs. To harness these resources effectively, we propose a two-stage training approach that addresses perception and reasoning tasks separately, leading to stronger perception capabilities and improved reasoning abilities. Through extensive experiments, we demonstrate that our MoUI models achieve significant gains in accuracy across all perception tasks and state-of-the-art results on the public reasoning benchmark ComplexQA (78%) and on our MoIQ (49%). We will be open-sourcing our dataset, code, and models to foster further research and innovation in the field.
%R 10.18653/v1/2025.findings-acl.1295
%U https://aclanthology.org/2025.findings-acl.1295/
%U https://doi.org/10.18653/v1/2025.findings-acl.1295
%P 25250-25269
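The record above is in the Refer/EndNote tagged format: each line starts with a `%` tag followed by a value, and tags such as `%A` (author), `%Y` (editor), and `%U` (URL) repeat. A minimal reader for this layout, assuming nothing beyond the lines shown here (the `parse_refer` name is ours), could look like:

```python
from collections import defaultdict

def parse_refer(record: str) -> dict:
    """Group the values of a Refer/EndNote record by tag, preserving repeats."""
    fields = defaultdict(list)
    for line in record.splitlines():
        line = line.strip()
        if line.startswith("%") and " " in line:
            tag, _, value = line.partition(" ")
            fields[tag.lstrip("%")].append(value)
    return dict(fields)

# For the record above, fields["A"] holds the six author names,
# fields["T"][0] the title, and fields["P"][0] the page range.
```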
Markdown (Informal)
[From Perception to Reasoning: Enhancing Vision-Language Models for Mobile UI Understanding](https://aclanthology.org/2025.findings-acl.1295/) (Sravanthi et al., Findings 2025)
ACL
Settaluri Lakshmi Sravanthi, Ankit Mishra, Debjyoti Mondal, Subhadarshi Panda, Rituraj Singh, and Pushpak Bhattacharyya. 2025. [From Perception to Reasoning: Enhancing Vision-Language Models for Mobile UI Understanding](https://aclanthology.org/2025.findings-acl.1295/). In *Findings of the Association for Computational Linguistics: ACL 2025*, pages 25250–25269, Vienna, Austria. Association for Computational Linguistics.
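For completeness, the BibTeX entry at the top of this page can also be read programmatically. The sketch below is deliberately simple: it only handles the one-line `key = "value"` fields used in that entry, so multi-line fields such as author and editor are skipped, and a dedicated BibTeX parser remains the more robust choice. The `bibtex_fields` name is ours.

```python
import re

# Matches single-line BibTeX fields of the form: key = "value",
FIELD_RE = re.compile(r'^\s*(\w+)\s*=\s*"(.*)",?\s*$')

def bibtex_fields(entry: str) -> dict:
    """Collect the one-line quoted fields of a BibTeX entry into a dict."""
    fields = {}
    for line in entry.splitlines():
        match = FIELD_RE.match(line)
        if match:
            fields[match.group(1)] = match.group(2)
    return fields

# e.g. bibtex_fields(entry)["doi"] == "10.18653/v1/2025.findings-acl.1295"
```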