@inproceedings{yang-etal-2026-dmsd,
title = "{DMSD}: Dual-Modal Semantic Disentanglement for Compositional Zero-Shot Learning",
author = "Yang, Pan and
Yang, Jing and
li, Ruan Xiao and
Chen, Yuling and
Wu, Yuankai and
Zhou, Quan and
Wang, Xu",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1540/",
pages = "30815--30826",
ISBN = "979-8-89176-395-1",
abstract = "The core challenge of Compositional Zero-Shot Learning (CZSL) lies in learning representations of sub-concepts (attributes and objects) from seen compositions and recognizing unseen novel compositions. Most existing CZSL methods primarily focus on prompt optimization on the textual side, while overlooking insufficient visual attribute{--}object sub-concepts disentanglement under a text-centric paradigm. To this end, we propose \textit{DMSD}, a \textit{D}ual-\textit{M}odal \textit{S}emantic \textit{D}isentanglement framework that jointly models visual and textual information to achieve effective sub-concept disentanglement. Specifically, \textit{DMSD} introduces a \textit{Contextual Prompt Space}, enabling both visual and textual modalities to be modeled under unified contextual semantic representations, thereby enhancing their alignment at the latent semantic level. Moreover, we design \textit{Visual Sub-concept Prototypes} that explicitly extract and model visual sub-concept features, improving the independence and discriminability of visual sub-concept representations. Furthermore, to achieve fine-grained alignment between visual and textual sub-concepts, we propose a \textit{Class-Centroid Bridging Module} that guides class centroids toward the textual semantic space, thereby ensuring cross-modal semantic consistency. Extensive experiments on three benchmark datasets (MIT-States, UT-Zappos, and C-GQA) demonstrate that \textit{DMSD} achieves state-of-the-art performance in both closed-world and open-world settings. Our code is available at https://anonymous.4open.science/r/DMSD-9CC4."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yang-etal-2026-dmsd">
<titleInfo>
<title>DMSD: Dual-Modal Semantic Disentanglement for Compositional Zero-Shot Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pan</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruan</namePart>
<namePart type="given">Xiao</namePart>
<namePart type="family">li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuling</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuankai</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Quan</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>The core challenge of Compositional Zero-Shot Learning (CZSL) lies in learning representations of sub-concepts (attributes and objects) from seen compositions and recognizing unseen novel compositions. Most existing CZSL methods primarily focus on prompt optimization on the textual side, while overlooking insufficient visual attribute–object sub-concepts disentanglement under a text-centric paradigm. To this end, we propose DMSD, a Dual-Modal Semantic Disentanglement framework that jointly models visual and textual information to achieve effective sub-concept disentanglement. Specifically, DMSD introduces a Contextual Prompt Space, enabling both visual and textual modalities to be modeled under unified contextual semantic representations, thereby enhancing their alignment at the latent semantic level. Moreover, we design Visual Sub-concept Prototypes that explicitly extract and model visual sub-concept features, improving the independence and discriminability of visual sub-concept representations. Furthermore, to achieve fine-grained alignment between visual and textual sub-concepts, we propose a Class-Centroid Bridging Module that guides class centroids toward the textual semantic space, thereby ensuring cross-modal semantic consistency. Extensive experiments on three benchmark datasets (MIT-States, UT-Zappos, and C-GQA) demonstrate that DMSD achieves state-of-the-art performance in both closed-world and open-world settings. Our code is available at https://anonymous.4open.science/r/DMSD-9CC4.</abstract>
<identifier type="citekey">yang-etal-2026-dmsd</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1540/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>30815</start>
<end>30826</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DMSD: Dual-Modal Semantic Disentanglement for Compositional Zero-Shot Learning
%A Yang, Pan
%A Yang, Jing
%A li, Ruan Xiao
%A Chen, Yuling
%A Wu, Yuankai
%A Zhou, Quan
%A Wang, Xu
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F yang-etal-2026-dmsd
%X The core challenge of Compositional Zero-Shot Learning (CZSL) lies in learning representations of sub-concepts (attributes and objects) from seen compositions and recognizing unseen novel compositions. Most existing CZSL methods primarily focus on prompt optimization on the textual side, while overlooking insufficient visual attribute–object sub-concepts disentanglement under a text-centric paradigm. To this end, we propose DMSD, a Dual-Modal Semantic Disentanglement framework that jointly models visual and textual information to achieve effective sub-concept disentanglement. Specifically, DMSD introduces a Contextual Prompt Space, enabling both visual and textual modalities to be modeled under unified contextual semantic representations, thereby enhancing their alignment at the latent semantic level. Moreover, we design Visual Sub-concept Prototypes that explicitly extract and model visual sub-concept features, improving the independence and discriminability of visual sub-concept representations. Furthermore, to achieve fine-grained alignment between visual and textual sub-concepts, we propose a Class-Centroid Bridging Module that guides class centroids toward the textual semantic space, thereby ensuring cross-modal semantic consistency. Extensive experiments on three benchmark datasets (MIT-States, UT-Zappos, and C-GQA) demonstrate that DMSD achieves state-of-the-art performance in both closed-world and open-world settings. Our code is available at https://anonymous.4open.science/r/DMSD-9CC4.
%U https://aclanthology.org/2026.findings-acl.1540/
%P 30815-30826
Markdown (Informal)
[DMSD: Dual-Modal Semantic Disentanglement for Compositional Zero-Shot Learning](https://aclanthology.org/2026.findings-acl.1540/) (Yang et al., Findings 2026)
ACL
- Pan Yang, Jing Yang, Ruan Xiao li, Yuling Chen, Yuankai Wu, Quan Zhou, and Xu Wang. 2026. DMSD: Dual-Modal Semantic Disentanglement for Compositional Zero-Shot Learning. In Findings of the Association for Computational Linguistics: ACL 2026, pages 30815–30826, San Diego, California, United States. Association for Computational Linguistics.