@inproceedings{hsiao-etal-2025-screenqa,
title = "{S}creen{QA}: Large-Scale Question-Answer Pairs Over Mobile App Screenshots",
author = "Hsiao, Yu-Chung and
Zubach, Fedir and
Baechler, Gilles and
Sunkara, Srinivas and
Carbune, Victor and
Lin, Jason and
Wang, Maria and
Zhu, Yun and
Chen, Jindong",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.477/",
doi = "10.18653/v1/2025.naacl-long.477",
pages = "9427--9452",
ISBN = "979-8-89176-189-6",
abstract = "We introduce ScreenQA, a novel benchmarking dataset designed to advance screen content understanding through question answering. The existing screen datasets are focused either on low-level structural and component understanding, or on a much higher-level composite task such as navigation and task completion for autonomous agents. ScreenQA attempts to bridge this gap. By annotating 86k question-answer pairs over the RICO dataset, we aim to benchmark the screen reading comprehension capacity, thereby laying the foundation for vision-based automation over screenshots. Our annotations encompass full answers, short answer phrases, and corresponding UI contents with bounding boxes, enabling four subtasks to address various application scenarios. We evaluate the dataset{'}s efficacy using both open-weight and proprietary models in zero-shot, fine-tuned, and transfer learning settings. We further demonstrate positive transfer to web applications, highlighting its potential beyond mobile applications."
}
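The abstract notes that each ScreenQA annotation pairs a question with a full answer, a short answer phrase, and the supporting UI contents with bounding boxes. As a rough illustration of what such a record could look like, here is a minimal Python sketch; the field names, example values, and the normalized-coordinate convention are assumptions made for illustration and are not taken from the paper or the released data.

```python
from dataclasses import dataclass, field
from typing import List

# Hypothetical sketch of a ScreenQA-style annotation record.
# Field names, values, and the [0, 1]-normalized bounding-box convention
# are illustrative assumptions, not the released schema.

@dataclass
class UIElement:
    text: str          # text content of the supporting UI element
    bbox: List[float]  # [left, top, right, bottom], normalized to [0, 1]

@dataclass
class ScreenQAExample:
    screenshot_id: str  # identifier of the underlying RICO screenshot
    question: str
    full_answer: str    # complete-sentence answer
    short_answer: str   # short answer phrase
    ui_elements: List[UIElement] = field(default_factory=list)

# Purely illustrative example instance.
example = ScreenQAExample(
    screenshot_id="rico_00042",
    question="What is the departure time of the first flight?",
    full_answer="The first flight departs at 7:45 AM.",
    short_answer="7:45 AM",
    ui_elements=[UIElement(text="7:45 AM", bbox=[0.12, 0.34, 0.30, 0.38])],
)
```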