% mrCAD paper, Findings of ACL: EMNLP 2025 (ACL Anthology 2025.findings-emnlp.1248).
% The {\_} sequences in the abstract are escaped underscores that mark emphasised
% terms; each emphasised term must have both an opening and a closing marker.
@inproceedings{mccarthy-etal-2025-mrcad,
  title     = {{mrCAD}: Multimodal Communication to Refine Computer-aided Designs},
  author    = {McCarthy, William P. and
               Vaduguru, Saujas and
               Willis, Karl D. D. and
               Matejka, Justin and
               Fan, Judith E. and
               Fried, Daniel and
               Pu, Yewen},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.1248/},
  pages     = {22905--22921},
  isbn      = {979-8-89176-335-7},
  abstract  = {In collaborative creation tasks, people steer artifacts towards specific goals by {\_}refining{\_} them with {\_}multimodal{\_} communication over multiple rounds of interaction. In contrast, generative AI excels at creating artifacts in a single turn but can struggle to make precise refinements that match our design intent. To close this gap, we present mrCAD, a dataset of multi-turn interactions in which pairs of humans iteratively created and refined computer-aided designs (CADs). In each game, a {\_}Designer{\_} sent instructions to a {\_}Maker{\_}, explaining how to create and subsequently refine a CAD to match a target design that only the {\_}Designer{\_} could see. mrCAD consists of 6,082 communication games, 15,163 instruction-execution rounds, played between 1,092 pairs of human players. Crucially, {\_}Designers{\_} had access to two communication modalities {--} text and drawing. Analysis finds that players relied more on text in refinement than in initial generation instructions, and used different linguistic elements for refinement than for generation. We also find that state-of-the-art VLMs are better at following generation instructions than refinement instructions. These results lay the foundation for modeling multi-turn, multimodal communication not captured in prior datasets.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for the mrCAD paper (citekey mccarthy-etal-2025-mrcad).
       Underscores in the abstract are paired emphasis markers around a term. -->
  <mods ID="mccarthy-etal-2025-mrcad">
    <titleInfo>
      <title>mrCAD: Multimodal Communication to Refine Computer-aided Designs</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">William</namePart>
      <namePart type="given">P</namePart>
      <namePart type="family">McCarthy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Saujas</namePart>
      <namePart type="family">Vaduguru</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Karl</namePart>
      <namePart type="given">D.D.</namePart>
      <namePart type="family">Willis</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Justin</namePart>
      <namePart type="family">Matejka</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Judith</namePart>
      <namePart type="given">E</namePart>
      <namePart type="family">Fan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Daniel</namePart>
      <namePart type="family">Fried</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yewen</namePart>
      <namePart type="family">Pu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-335-7</identifier>
    </relatedItem>
    <abstract>In collaborative creation tasks, people steer artifacts towards specific goals by _refining_ them with _multimodal_ communication over multiple rounds of interaction. In contrast, generative AI excels at creating artifacts in a single turn but can struggle to make precise refinements that match our design intent. To close this gap, we present mrCAD, a dataset of multi-turn interactions in which pairs of humans iteratively created and refined computer-aided designs (CADs). In each game, a _Designer_ sent instructions to a _Maker_, explaining how to create and subsequently refine a CAD to match a target design that only the _Designer_ could see. mrCAD consists of 6,082 communication games, 15,163 instruction-execution rounds, played between 1,092 pairs of human players. Crucially, _Designers_ had access to two communication modalities – text and drawing. Analysis finds that players relied more on text in refinement than in initial generation instructions, and used different linguistic elements for refinement than for generation. We also find that state-of-the-art VLMs are better at following generation instructions than refinement instructions. These results lay the foundation for modeling multi-turn, multimodal communication not captured in prior datasets.</abstract>
    <identifier type="citekey">mccarthy-etal-2025-mrcad</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-emnlp.1248/</url>
    </location>
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>22905</start>
        <end>22921</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T mrCAD: Multimodal Communication to Refine Computer-aided Designs
%A McCarthy, William P.
%A Vaduguru, Saujas
%A Willis, Karl D.D.
%A Matejka, Justin
%A Fan, Judith E.
%A Fried, Daniel
%A Pu, Yewen
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F mccarthy-etal-2025-mrcad
%X In collaborative creation tasks, people steer artifacts towards specific goals by _refining_ them with _multimodal_ communication over multiple rounds of interaction. In contrast, generative AI excels at creating artifacts in a single turn but can struggle to make precise refinements that match our design intent. To close this gap, we present mrCAD, a dataset of multi-turn interactions in which pairs of humans iteratively created and refined computer-aided designs (CADs). In each game, a _Designer_ sent instructions to a _Maker_, explaining how to create and subsequently refine a CAD to match a target design that only the _Designer_ could see. mrCAD consists of 6,082 communication games, 15,163 instruction-execution rounds, played between 1,092 pairs of human players. Crucially, _Designers_ had access to two communication modalities – text and drawing. Analysis finds that players relied more on text in refinement than in initial generation instructions, and used different linguistic elements for refinement than for generation. We also find that state-of-the-art VLMs are better at following generation instructions than refinement instructions. These results lay the foundation for modeling multi-turn, multimodal communication not captured in prior datasets.
%U https://aclanthology.org/2025.findings-emnlp.1248/
%P 22905-22921
Markdown (Informal)
[mrCAD: Multimodal Communication to Refine Computer-aided Designs](https://aclanthology.org/2025.findings-emnlp.1248/) (McCarthy et al., Findings 2025)
ACL
- William P McCarthy, Saujas Vaduguru, Karl D.D. Willis, Justin Matejka, Judith E Fan, Daniel Fried, and Yewen Pu. 2025. mrCAD: Multimodal Communication to Refine Computer-aided Designs. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 22905–22921, Suzhou, China. Association for Computational Linguistics.