@inproceedings{gubbi-venkatesh-etal-2024-ugif,
title = "{UGIF}-{D}ata{S}et: A New Dataset for Cross-lingual, Cross-modal Sequential actions on the {UI}",
author = "Gubbi Venkatesh, Sagar and
Talukdar, Partha and
Narayanan, Srini",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.89",
doi = "10.18653/v1/2024.findings-naacl.89",
pages = "1390--1399",
abstract = "Help documents are supposed to aid smartphone users in resolving queries such as {``}How to block calls from unknown numbers?{''}. However, given a query, identifying the right help document, understanding instructions from the document, and using them to resolve the issue at hand is challenging. The user experience may be enhanced by converting the instructions in the help document to a step-by-step tutorial overlaid on the phone UI. Successful execution of this task requires overcoming research challenges in retrieval, parsing, and grounding in the multilingual-multimodal setting. For example, user queries in one language may have to be matched against instructions in another language, which in turn needs to be grounded in a multimodal UI in yet another language. Moreover, there isn{'}t any relevant dataset for such a task. In order to bridge this gap, we introduce UGIF-DataSet, a multi-lingual, multi-modal UI grounded dataset for step-by-step task completion on the smartphone, containing 4,184 tasks across 8 languages. The instruction steps in UGIF-DataSet are available only in English, so the challenge involves operations in the cross-modal, cross-lingual setting. We compare the performance of different large language models for this task and find that the end-to-end task completion rate drops from 48{\%} in English to 32{\%} for other languages, demonstrating significant overall headroom for improvement. We are hopeful that UGIF-DataSet and our analysis will aid further research on the important problem of sequential task completion in the multilingual and multimodal setting.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gubbi-venkatesh-etal-2024-ugif">
<titleInfo>
<title>UGIF-DataSet: A New Dataset for Cross-lingual, Cross-modal Sequential actions on the UI</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sagar</namePart>
<namePart type="family">Gubbi Venkatesh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Partha</namePart>
<namePart type="family">Talukdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Srini</namePart>
<namePart type="family">Narayanan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Help documents are supposed to aid smartphone users in resolving queries such as “How to block calls from unknown numbers?”. However, given a query, identifying the right help document, understanding instructions from the document, and using them to resolve the issue at hand is challenging. The user experience may be enhanced by converting the instructions in the help document to a step-by-step tutorial overlaid on the phone UI. Successful execution of this task requires overcoming research challenges in retrieval, parsing, and grounding in the multilingual-multimodal setting. For example, user queries in one language may have to be matched against instructions in another language, which in turn needs to be grounded in a multimodal UI in yet another language. Moreover, there isn’t any relevant dataset for such a task. In order to bridge this gap, we introduce UGIF-DataSet, a multi-lingual, multi-modal UI grounded dataset for step-by-step task completion on the smartphone, containing 4,184 tasks across 8 languages. The instruction steps in UGIF-DataSet are available only in English, so the challenge involves operations in the cross-modal, cross-lingual setting. We compare the performance of different large language models for this task and find that the end-to-end task completion rate drops from 48% in English to 32% for other languages, demonstrating significant overall headroom for improvement. We are hopeful that UGIF-DataSet and our analysis will aid further research on the important problem of sequential task completion in the multilingual and multimodal setting.</abstract>
<identifier type="citekey">gubbi-venkatesh-etal-2024-ugif</identifier>
<identifier type="doi">10.18653/v1/2024.findings-naacl.89</identifier>
<location>
<url>https://aclanthology.org/2024.findings-naacl.89</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>1390</start>
<end>1399</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T UGIF-DataSet: A New Dataset for Cross-lingual, Cross-modal Sequential actions on the UI
%A Gubbi Venkatesh, Sagar
%A Talukdar, Partha
%A Narayanan, Srini
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F gubbi-venkatesh-etal-2024-ugif
%X Help documents are supposed to aid smartphone users in resolving queries such as “How to block calls from unknown numbers?”. However, given a query, identifying the right help document, understanding instructions from the document, and using them to resolve the issue at hand is challenging. The user experience may be enhanced by converting the instructions in the help document to a step-by-step tutorial overlaid on the phone UI. Successful execution of this task requires overcoming research challenges in retrieval, parsing, and grounding in the multilingual-multimodal setting. For example, user queries in one language may have to be matched against instructions in another language, which in turn needs to be grounded in a multimodal UI in yet another language. Moreover, there isn’t any relevant dataset for such a task. In order to bridge this gap, we introduce UGIF-DataSet, a multi-lingual, multi-modal UI grounded dataset for step-by-step task completion on the smartphone, containing 4,184 tasks across 8 languages. The instruction steps in UGIF-DataSet are available only in English, so the challenge involves operations in the cross-modal, cross-lingual setting. We compare the performance of different large language models for this task and find that the end-to-end task completion rate drops from 48% in English to 32% for other languages, demonstrating significant overall headroom for improvement. We are hopeful that UGIF-DataSet and our analysis will aid further research on the important problem of sequential task completion in the multilingual and multimodal setting.
%R 10.18653/v1/2024.findings-naacl.89
%U https://aclanthology.org/2024.findings-naacl.89
%U https://doi.org/10.18653/v1/2024.findings-naacl.89
%P 1390-1399
Markdown (Informal)
[UGIF-DataSet: A New Dataset for Cross-lingual, Cross-modal Sequential actions on the UI](https://aclanthology.org/2024.findings-naacl.89) (Gubbi Venkatesh et al., Findings 2024)
ACL