@inproceedings{chen-etal-2024-dual-oriented,
title = "Dual-oriented Disentangled Network with Counterfactual Intervention for Multimodal Intent Detection",
author = "Chen, Zhanpeng and
Zhu, Zhihong and
Zhuang, Xianwei and
Huang, Zhiqi and
Zou, Yuexian",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.972",
pages = "17554--17567",
abstract = "Multimodal intent detection is designed to leverage diverse modalities for a comprehensive understanding of user intentions in real-world scenarios, thus playing a critical role in modern task-oriented dialogue systems. Existing methods have made great progress in modal alignment and fusion, however, two vital limitations are neglected: (I) close entanglement of multimodal semantics with modal structures; (II) insufficient learning of the causal effects of semantic and modality-specific information on the final predictions under the end-to-end training fashion. To alleviate the above limitations, we introduce the Dual-oriented Disentangled Network with Counterfactual Intervention (DuoDN). DuoDN addresses key limitations in current systems by effectively disentangling and utilizing modality-specific and multimodal semantic information. The model consists of a Dual-oriented Disentangled Encoder that decouples semantics-oriented and modality-oriented representations, alongside a Counterfactual Intervention Module that applies causal inference to understand causal effects by injecting confounders. Experiments on three benchmark datasets demonstrate DuoDN{'}s superiority over existing methods, with extensive analysis validating its advantages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2024-dual-oriented">
<titleInfo>
<title>Dual-oriented Disentangled Network with Counterfactual Intervention for Multimodal Intent Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhanpeng</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhihong</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xianwei</namePart>
<namePart type="family">Zhuang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiqi</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuexian</namePart>
<namePart type="family">Zou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multimodal intent detection is designed to leverage diverse modalities for a comprehensive understanding of user intentions in real-world scenarios, thus playing a critical role in modern task-oriented dialogue systems. Existing methods have made great progress in modal alignment and fusion, however, two vital limitations are neglected: (I) close entanglement of multimodal semantics with modal structures; (II) insufficient learning of the causal effects of semantic and modality-specific information on the final predictions under the end-to-end training fashion. To alleviate the above limitations, we introduce the Dual-oriented Disentangled Network with Counterfactual Intervention (DuoDN). DuoDN addresses key limitations in current systems by effectively disentangling and utilizing modality-specific and multimodal semantic information. The model consists of a Dual-oriented Disentangled Encoder that decouples semantics-oriented and modality-oriented representations, alongside a Counterfactual Intervention Module that applies causal inference to understand causal effects by injecting confounders. Experiments on three benchmark datasets demonstrate DuoDN’s superiority over existing methods, with extensive analysis validating its advantages.</abstract>
<identifier type="citekey">chen-etal-2024-dual-oriented</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.972</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>17554</start>
<end>17567</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Dual-oriented Disentangled Network with Counterfactual Intervention for Multimodal Intent Detection
%A Chen, Zhanpeng
%A Zhu, Zhihong
%A Zhuang, Xianwei
%A Huang, Zhiqi
%A Zou, Yuexian
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F chen-etal-2024-dual-oriented
%X Multimodal intent detection is designed to leverage diverse modalities for a comprehensive understanding of user intentions in real-world scenarios, thus playing a critical role in modern task-oriented dialogue systems. Existing methods have made great progress in modal alignment and fusion, however, two vital limitations are neglected: (I) close entanglement of multimodal semantics with modal structures; (II) insufficient learning of the causal effects of semantic and modality-specific information on the final predictions under the end-to-end training fashion. To alleviate the above limitations, we introduce the Dual-oriented Disentangled Network with Counterfactual Intervention (DuoDN). DuoDN addresses key limitations in current systems by effectively disentangling and utilizing modality-specific and multimodal semantic information. The model consists of a Dual-oriented Disentangled Encoder that decouples semantics-oriented and modality-oriented representations, alongside a Counterfactual Intervention Module that applies causal inference to understand causal effects by injecting confounders. Experiments on three benchmark datasets demonstrate DuoDN’s superiority over existing methods, with extensive analysis validating its advantages.
%U https://aclanthology.org/2024.emnlp-main.972
%P 17554-17567
Markdown (Informal)
[Dual-oriented Disentangled Network with Counterfactual Intervention for Multimodal Intent Detection](https://aclanthology.org/2024.emnlp-main.972) (Chen et al., EMNLP 2024)
ACL