% Conference paper entry (citation key liu-etal-2026-vida), Findings of ACL 2026.
% NOTE(review): `address` appears to hold the conference venue rather than the
% publisher's city -- confirm before treating it as the publisher location.
@inproceedings{liu-etal-2026-vida,
  title     = {{VIDA}: A Visual Intent-driven Design Assistant for Proactive Multimodal Clarification},
  author    = {Liu, Yanshan and
               Zhang, Hongbo and
               Sun, Zhen and
               Wei, Jiaheng and
               Wu, Kaishun},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1687/},
  pages     = {33793--33804},
  isbn      = {979-8-89176-395-1},
  abstract  = {In complex domains like interior design, user requests are often ambiguous and multimodal. Professional designers address this by asking strategic clarification questions based on hierarchical priorities, a capability lacking in current Vision-Language Models (VLMs). When fine-tuned on dialogue data, existing models often exhibit modality forgetting, overfitting to textual patterns while neglecting visual cues and thus producing hallucinated or visually irrelevant questions. To bridge this gap, we introduce VIDA (Visual Intent-driven Design Assistant), an assistant designed to generate proactive, visually grounded, and strategically prioritized clarification questions. Instead of standard fine-tuning, we propose a strategy-aware alignment framework that evolves from imitation learning to value-driven reinforcement. We utilize Group Sequence Policy Optimization to strictly enforce expert protocols, ensuring the model not only mimics fluent speech but also adheres to optimal inquiry strategies. Crucially, we design a novel hierarchical reward mechanism with Dynamic Intent Binding to align the assistant with professional prioritization standards. To facilitate this research, we construct and release InteriorClarify, a multimodal benchmark dataset comprising 1,016 real-world consultation cases annotated with this three-tier intent hierarchy. Extensive experiments demonstrate that VIDA sets a new state-of-the-art, improving the Strategic Alignment Score (SAS) by 20.59{\%} over SFT baselines and effectively restoring visual grounding capabilities lost during standard fine-tuning.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2026-vida">
<titleInfo>
<title>VIDA: A Visual Intent-driven Design Assistant for Proactive Multimodal Clarification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yanshan</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongbo</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhen</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaheng</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaishun</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P.</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>In complex domains like interior design, user requests are often ambiguous and multimodal. Professional designers address this by asking strategic clarification questions based on hierarchical priorities, a capability lacking in current Vision-Language Models (VLMs). When fine-tuned on dialogue data, existing models often exhibit modality forgetting, overfitting to textual patterns while neglecting visual cues and thus producing hallucinated or visually irrelevant questions. To bridge this gap, we introduce VIDA (Visual Intent-driven Design Assistant), an assistant designed to generate proactive, visually grounded, and strategically prioritized clarification questions. Instead of standard fine-tuning, we propose a strategy-aware alignment framework that evolves from imitation learning to value-driven reinforcement. We utilize Group Sequence Policy Optimization to strictly enforce expert protocols, ensuring the model not only mimics fluent speech but also adheres to optimal inquiry strategies. Crucially, we design a novel hierarchical reward mechanism with Dynamic Intent Binding to align the assistant with professional prioritization standards. To facilitate this research, we construct and release InteriorClarify, a multimodal benchmark dataset comprising 1,016 real-world consultation cases annotated with this three-tier intent hierarchy. Extensive experiments demonstrate that VIDA sets a new state-of-the-art, improving the Strategic Alignment Score (SAS) by 20.59% over SFT baselines and effectively restoring visual grounding capabilities lost during standard fine-tuning.</abstract>
<identifier type="citekey">liu-etal-2026-vida</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1687/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>33793</start>
<end>33804</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T VIDA: A Visual Intent-driven Design Assistant for Proactive Multimodal Clarification
%A Liu, Yanshan
%A Zhang, Hongbo
%A Sun, Zhen
%A Wei, Jiaheng
%A Wu, Kaishun
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F liu-etal-2026-vida
%X In complex domains like interior design, user requests are often ambiguous and multimodal. Professional designers address this by asking strategic clarification questions based on hierarchical priorities, a capability lacking in current Vision-Language Models (VLMs). When fine-tuned on dialogue data, existing models often exhibit modality forgetting, overfitting to textual patterns while neglecting visual cues and thus producing hallucinated or visually irrelevant questions. To bridge this gap, we introduce VIDA (Visual Intent-driven Design Assistant), an assistant designed to generate proactive, visually grounded, and strategically prioritized clarification questions. Instead of standard fine-tuning, we propose a strategy-aware alignment framework that evolves from imitation learning to value-driven reinforcement. We utilize Group Sequence Policy Optimization to strictly enforce expert protocols, ensuring the model not only mimics fluent speech but also adheres to optimal inquiry strategies. Crucially, we design a novel hierarchical reward mechanism with Dynamic Intent Binding to align the assistant with professional prioritization standards. To facilitate this research, we construct and release InteriorClarify, a multimodal benchmark dataset comprising 1,016 real-world consultation cases annotated with this three-tier intent hierarchy. Extensive experiments demonstrate that VIDA sets a new state-of-the-art, improving the Strategic Alignment Score (SAS) by 20.59% over SFT baselines and effectively restoring visual grounding capabilities lost during standard fine-tuning.
%U https://aclanthology.org/2026.findings-acl.1687/
%P 33793-33804
Markdown (Informal)
[VIDA: A Visual Intent-driven Design Assistant for Proactive Multimodal Clarification](https://aclanthology.org/2026.findings-acl.1687/) (Liu et al., Findings 2026)
ACL