@inproceedings{bhargava-etal-2023-referring,
title = "Referring to Screen Texts with Voice Assistants",
author = "Bhargava, Shruti and
Dhoot, Anand and
Jonsson, Ing-marie and
Nguyen, Hoang Long and
Patel, Alkesh and
Yu, Hong and
Renkens, Vincent",
editor = "Sitaram, Sunayana and
Beigman Klebanov, Beata and
Williams, Jason D",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-industry.72",
doi = "10.18653/v1/2023.acl-industry.72",
pages = "752--762",
abstract = "Voice assistants help users make phone calls, send messages, create events, navigate and do a lot more. However assistants have limited capacity to understand their users{'} context. In this work, we aim to take a step in this direction. Our work dives into a new experience for users to refer to phone numbers, addresses, email addresses, urls, and dates on their phone screens. We focus on reference understanding, which is particularly interesting when, similar to visual grounding, there are multiple similar texts on screen. We collect a dataset and propose a lightweight general purpose model for this novel experience. Since consuming pixels directly is expensive, our system is designed to rely only on text extracted from the UI. Our model is modular, offering flexibility, better interpretability and efficient run time memory.",
}
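
For completeness, here is a minimal sketch of pulling the fields out of the BibTeX record above using only the Python standard library. The brace-counting loop exists because values such as the abstract contain nested braces (`{'}`); the file name is illustrative, and a real pipeline would reach for a dedicated parser such as bibtexparser instead of this naive scan.

```python
import re

def parse_bibtex_fields(entry: str) -> dict[str, str]:
    """Extract field = "value" / field = {value} pairs from one entry (naive sketch)."""
    fields = {}
    for m in re.finditer(r'(\w+)\s*=\s*(["{])', entry):
        key, opener = m.group(1).lower(), m.group(2)
        i = m.end()
        if opener == '"':
            # Quote-delimited: the value may contain braces but no quotes.
            j = entry.index('"', i)
        else:
            # Brace-delimited: walk forward, tracking nesting depth.
            depth, j = 1, i
            while depth:
                depth += {'{': 1, '}': -1}.get(entry[j], 0)
                j += 1
            j -= 1
        fields[key] = ' '.join(entry[i:j].split())  # collapse soft line breaks
    return fields

record = open('bhargava2023.bib').read()  # illustrative file name
info = parse_bibtex_fields(record)
print(info['title'])
print(info['pages'])
```

Bare macro values such as `month = jul` are deliberately skipped here, since the export only uses them for month abbreviations.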
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bhargava-etal-2023-referring">
<titleInfo>
<title>Referring to Screen Texts with Voice Assistants</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shruti</namePart>
<namePart type="family">Bhargava</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anand</namePart>
<namePart type="family">Dhoot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ing-marie</namePart>
<namePart type="family">Jonsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hoang</namePart>
<namePart type="given">Long</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alkesh</namePart>
<namePart type="family">Patel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hong</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vincent</namePart>
<namePart type="family">Renkens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sunayana</namePart>
<namePart type="family">Sitaram</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Beata</namePart>
<namePart type="family">Beigman Klebanov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jason</namePart>
<namePart type="given">D</namePart>
<namePart type="family">Williams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Voice assistants help users make phone calls, send messages, create events, navigate, and do a lot more. However, assistants have limited capacity to understand their users’ context. In this work, we aim to take a step in this direction. Our work dives into a new experience for users to refer to phone numbers, addresses, email addresses, URLs, and dates on their phone screens. We focus on reference understanding, which is particularly interesting when, similar to visual grounding, there are multiple similar texts on screen. We collect a dataset and propose a lightweight general-purpose model for this novel experience. Since consuming pixels directly is expensive, our system is designed to rely only on text extracted from the UI. Our model is modular, offering flexibility, better interpretability, and efficient use of runtime memory.</abstract>
<identifier type="citekey">bhargava-etal-2023-referring</identifier>
<identifier type="doi">10.18653/v1/2023.acl-industry.72</identifier>
<location>
<url>https://aclanthology.org/2023.acl-industry.72</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>752</start>
<end>762</end>
</extent>
</part>
</mods>
</modsCollection>
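
Because the MODS record is namespaced, any lookup has to carry the URI from the `xmlns` attribute. Below is a short standard-library sketch that recovers the title, the author list, and the page range from the record above; the file name is illustrative.

```python
import xml.etree.ElementTree as ET

NS = {'m': 'http://www.loc.gov/mods/v3'}  # taken from the xmlns attribute above

tree = ET.parse('bhargava2023.xml')  # illustrative file name
mods = tree.getroot().find('m:mods', NS)

title = mods.findtext('m:titleInfo/m:title', namespaces=NS)

# Only direct children of <mods> are matched here, so the editors nested
# under <relatedItem> are excluded; the role check filters further.
authors = []
for name in mods.findall("m:name[@type='personal']", NS):
    if name.findtext('m:role/m:roleTerm', namespaces=NS) == 'author':
        given = ' '.join(
            part.text for part in name.findall("m:namePart[@type='given']", NS))
        family = name.findtext("m:namePart[@type='family']", namespaces=NS)
        authors.append(f'{given} {family}')

start = mods.findtext('m:part/m:extent/m:start', namespaces=NS)
end = mods.findtext('m:part/m:extent/m:end', namespaces=NS)
print(title)
print('; '.join(authors), f'(pp. {start}-{end})')
```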
%0 Conference Proceedings
%T Referring to Screen Texts with Voice Assistants
%A Bhargava, Shruti
%A Dhoot, Anand
%A Jonsson, Ing-marie
%A Nguyen, Hoang Long
%A Patel, Alkesh
%A Yu, Hong
%A Renkens, Vincent
%Y Sitaram, Sunayana
%Y Beigman Klebanov, Beata
%Y Williams, Jason D.
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F bhargava-etal-2023-referring
%X Voice assistants help users make phone calls, send messages, create events, navigate, and do a lot more. However, assistants have limited capacity to understand their users’ context. In this work, we aim to take a step in this direction. Our work dives into a new experience for users to refer to phone numbers, addresses, email addresses, URLs, and dates on their phone screens. We focus on reference understanding, which is particularly interesting when, similar to visual grounding, there are multiple similar texts on screen. We collect a dataset and propose a lightweight general-purpose model for this novel experience. Since consuming pixels directly is expensive, our system is designed to rely only on text extracted from the UI. Our model is modular, offering flexibility, better interpretability, and efficient use of runtime memory.
%R 10.18653/v1/2023.acl-industry.72
%U https://aclanthology.org/2023.acl-industry.72
%U https://doi.org/10.18653/v1/2023.acl-industry.72
%P 752-762
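
The tagged (refer/EndNote-style) record above is strictly line-oriented: a tag such as %A or %T, a space, then the value. A sketch parser therefore needs only string splitting; which tags may repeat (%A authors, %E/%Y editors, %U URLs) is an assumption read off this record, and the file name is illustrative.

```python
def parse_refer(text: str) -> dict[str, object]:
    """Fold one %-tagged record into a dict; repeatable tags become lists."""
    repeatable = {'%A', '%E', '%Y', '%U'}  # assumed from the record above
    record: dict[str, object] = {}
    for line in text.splitlines():
        if not line.startswith('%'):
            continue  # skip anything that is not a tagged line
        tag, _, value = line.partition(' ')
        if tag in repeatable:
            record.setdefault(tag, []).append(value)
        else:
            record[tag] = value
    return record

rec = parse_refer(open('bhargava2023.enw').read())  # illustrative file name
print(rec['%T'])
print(' and '.join(rec['%A']))
print(rec['%P'])
```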
Markdown (Informal):
[Referring to Screen Texts with Voice Assistants](https://aclanthology.org/2023.acl-industry.72) (Bhargava et al., ACL 2023)

ACL:
Shruti Bhargava, Anand Dhoot, Ing-marie Jonsson, Hoang Long Nguyen, Alkesh Patel, Hong Yu, and Vincent Renkens. 2023. Referring to Screen Texts with Voice Assistants. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track), pages 752–762, Toronto, Canada. Association for Computational Linguistics.