@inproceedings{miah-etal-2024-multimodal,
title = "Multimodal Contextual Dialogue Breakdown Detection for Conversational {AI} Models",
author = "Miah, Md Messal Monem and
Schnaithmann, Ulie and
Raghuvanshi, Arushi and
Son, Youngseo",
editor = "Yang, Yi and
Davani, Aida and
Sil, Avi and
Kumar, Anoop",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 6: Industry Track)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-industry.25",
doi = "10.18653/v1/2024.naacl-industry.25",
pages = "303--314",
abstract = "Detecting dialogue breakdown in real time is critical for conversational AI systems, because it enables taking corrective action to successfully complete a task. In spoken dialog systems, this breakdown can be caused by a variety of unexpected situations including high levels of background noise, causing STT mistranscriptions, or unexpected user flows.In particular, industry settings like healthcare, require high precision and high flexibility to navigate differently based on the conversation history and dialogue states. This makes it both more challenging and more critical to accurately detect dialog breakdown. To accurately detect breakdown, we found it requires processing audio inputs along with downstream NLP model inferences on transcribed text in real time. In this paper, we introduce a Multimodal Contextual Dialogue Breakdown (MultConDB) model. This model significantly outperforms other known best models by achieving an F1 of 69.27.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="miah-etal-2024-multimodal">
<titleInfo>
<title>Multimodal Contextual Dialogue Breakdown Detection for Conversational AI Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="given">Messal</namePart>
<namePart type="given">Monem</namePart>
<namePart type="family">Miah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ulie</namePart>
<namePart type="family">Schnaithmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arushi</namePart>
<namePart type="family">Raghuvanshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Youngseo</namePart>
<namePart type="family">Son</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 6: Industry Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aida</namePart>
<namePart type="family">Davani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avi</namePart>
<namePart type="family">Sil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Detecting dialogue breakdown in real time is critical for conversational AI systems because it enables taking corrective action to successfully complete a task. In spoken dialogue systems, this breakdown can be caused by a variety of unexpected situations, including high levels of background noise that cause STT mistranscriptions, or unexpected user flows. In particular, industry settings like healthcare require high precision and high flexibility to navigate differently based on the conversation history and dialogue states. This makes it both more challenging and more critical to accurately detect dialogue breakdown. We found that accurate breakdown detection requires processing audio inputs along with downstream NLP model inferences on transcribed text in real time. In this paper, we introduce a Multimodal Contextual Dialogue Breakdown (MultConDB) model. This model significantly outperforms other known best models by achieving an F1 of 69.27.</abstract>
<identifier type="citekey">miah-etal-2024-multimodal</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-industry.25</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-industry.25</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>303</start>
<end>314</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multimodal Contextual Dialogue Breakdown Detection for Conversational AI Models
%A Miah, Md Messal Monem
%A Schnaithmann, Ulie
%A Raghuvanshi, Arushi
%A Son, Youngseo
%Y Yang, Yi
%Y Davani, Aida
%Y Sil, Avi
%Y Kumar, Anoop
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 6: Industry Track)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F miah-etal-2024-multimodal
%X Detecting dialogue breakdown in real time is critical for conversational AI systems because it enables taking corrective action to successfully complete a task. In spoken dialogue systems, this breakdown can be caused by a variety of unexpected situations, including high levels of background noise that cause STT mistranscriptions, or unexpected user flows. In particular, industry settings like healthcare require high precision and high flexibility to navigate differently based on the conversation history and dialogue states. This makes it both more challenging and more critical to accurately detect dialogue breakdown. We found that accurate breakdown detection requires processing audio inputs along with downstream NLP model inferences on transcribed text in real time. In this paper, we introduce a Multimodal Contextual Dialogue Breakdown (MultConDB) model. This model significantly outperforms other known best models by achieving an F1 of 69.27.
%R 10.18653/v1/2024.naacl-industry.25
%U https://aclanthology.org/2024.naacl-industry.25
%U https://doi.org/10.18653/v1/2024.naacl-industry.25
%P 303-314
Markdown (Informal)
[Multimodal Contextual Dialogue Breakdown Detection for Conversational AI Models](https://aclanthology.org/2024.naacl-industry.25) (Miah et al., NAACL 2024)
ACL
Md Messal Monem Miah, Ulie Schnaithmann, Arushi Raghuvanshi, and Youngseo Son. 2024. Multimodal Contextual Dialogue Breakdown Detection for Conversational AI Models. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 6: Industry Track), pages 303–314, Mexico City, Mexico. Association for Computational Linguistics.