@inproceedings{doran-dahl-2024-lamppost,
title = "It{'}s Not under the Lamppost: Expanding the Reach of Conversational {AI}",
author = "Doran, Christy and
Dahl, Deborah A.",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.825",
pages = "9441--9451",
abstract = "Generic commercial language-based assistants have become ubiquitously available, originally in the form of smart speakers and mobile apps, and more recently in the form of systems based on generative AI. At first glance, their capabilities seem remarkable. Speech recognition works well, NLU mostly works, and access to back-end information sources is usually quite good. However, there is still a lot of work to be done. In the area of NLU in particular, focused probes into the capabilities of language-based assistants easily reveal significant areas of brittleness that demonstrate large gaps in their coverage. For example, the straightforward disjunctive query \textit{is this monday or tuesday} elicited the nonsensical response \textit{it{'}s 2:50 p.m. many consider it to be the afternoon}. These gaps are difficult to identify if the development process relies on training the system with an ongoing supply of natural user data, because this natural data can become distorted by a self-reinforcing feedback loop where the system {`}trains{'} the user to produce data that works. This paper describes a process for collecting specific kinds of data to uncover these gaps and an annotation scheme for system responses, and includes examples of simple utterances that nonetheless fail to be correctly processed. The systems tested include both Conventional assistants, such as Amazon Alexa and Google Assistant, as well as GenAI systems, including ChatGPT and Bard/Gemini. We claim that these failures are due to a lack of attention to the full spectrum of input possibilities, and argue that systems would benefit from the inclusion of focused manual assessment to directly target likely gaps.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="doran-dahl-2024-lamppost">
    <titleInfo>
      <title>It’s Not under the Lamppost: Expanding the Reach of Conversational AI</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Christy</namePart>
      <namePart type="family">Doran</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Deborah</namePart>
      <namePart type="given">A</namePart>
      <namePart type="family">Dahl</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Min-Yen</namePart>
        <namePart type="family">Kan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Veronique</namePart>
        <namePart type="family">Hoste</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alessandro</namePart>
        <namePart type="family">Lenci</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sakriani</namePart>
        <namePart type="family">Sakti</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nianwen</namePart>
        <namePart type="family">Xue</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>ELRA and ICCL</publisher>
        <place>
          <placeTerm type="text">Torino, Italia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Generic commercial language-based assistants have become ubiquitously available, originally in the form of smart speakers and mobile apps, and more recently in the form of systems based on generative AI. At first glance, their capabilities seem remarkable. Speech recognition works well, NLU mostly works, and access to back-end information sources is usually quite good. However, there is still a lot of work to be done. In the area of NLU in particular, focused probes into the capabilities of language-based assistants easily reveal significant areas of brittleness that demonstrate large gaps in their coverage. For example, the straightforward disjunctive query is this monday or tuesday elicited the nonsensical response it’s 2:50 p.m. many consider it to be the afternoon. These gaps are difficult to identify if the development process relies on training the system with an ongoing supply of natural user data, because this natural data can become distorted by a self-reinforcing feedback loop where the system ‘trains’ the user to produce data that works. This paper describes a process for collecting specific kinds of data to uncover these gaps and an annotation scheme for system responses, and includes examples of simple utterances that nonetheless fail to be correctly processed. The systems tested include both Conventional assistants, such as Amazon Alexa and Google Assistant, and GenAI systems, including ChatGPT and Bard/Gemini. We claim that these failures are due to a lack of attention to the full spectrum of input possibilities, and argue that systems would benefit from the inclusion of focused manual assessment to directly target likely gaps.</abstract>
    <identifier type="citekey">doran-dahl-2024-lamppost</identifier>
    <location>
      <url>https://aclanthology.org/2024.lrec-main.825</url>
    </location>
    <part>
      <date>2024-05</date>
      <extent unit="page">
        <start>9441</start>
        <end>9451</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T It’s Not under the Lamppost: Expanding the Reach of Conversational AI
%A Doran, Christy
%A Dahl, Deborah A.
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F doran-dahl-2024-lamppost
%X Generic commercial language-based assistants have become ubiquitously available, originally in the form of smart speakers and mobile apps, and more recently in the form of systems based on generative AI. At first glance, their capabilities seem remarkable. Speech recognition works well, NLU mostly works, and access to back-end information sources is usually quite good. However, there is still a lot of work to be done. In the area of NLU in particular, focused probes into the capabilities of language-based assistants easily reveal significant areas of brittleness that demonstrate large gaps in their coverage. For example, the straightforward disjunctive query is this monday or tuesday elicited the nonsensical response it’s 2:50 p.m. many consider it to be the afternoon. These gaps are difficult to identify if the development process relies on training the system with an ongoing supply of natural user data, because this natural data can become distorted by a self-reinforcing feedback loop where the system ‘trains’ the user to produce data that works. This paper describes a process for collecting specific kinds of data to uncover these gaps and an annotation scheme for system responses, and includes examples of simple utterances that nonetheless fail to be correctly processed. The systems tested include both Conventional assistants, such as Amazon Alexa and Google Assistant, and GenAI systems, including ChatGPT and Bard/Gemini. We claim that these failures are due to a lack of attention to the full spectrum of input possibilities, and argue that systems would benefit from the inclusion of focused manual assessment to directly target likely gaps.
%U https://aclanthology.org/2024.lrec-main.825
%P 9441-9451
Markdown (Informal)
[It’s Not under the Lamppost: Expanding the Reach of Conversational AI](https://aclanthology.org/2024.lrec-main.825) (Doran & Dahl, LREC-COLING 2024)
ACL
Christy Doran and Deborah A. Dahl. 2024. It’s Not under the Lamppost: Expanding the Reach of Conversational AI. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 9441–9451, Torino, Italia. ELRA and ICCL.