% SIGDIAL 2025 paper by Bowden and Walker; ACL Anthology ID 2025.sigdial-1.44.
% ChatGPT is brace-protected as a whole word so sentence-casing styles keep its capitals.
@inproceedings{bowden-walker-2025-dialogue,
  title     = {Dialogue Scaffolding: Producing a Realistic Corpus of Human-Computer Open-Domain Dialogues Using a Spoken Dialogue System and {ChatGPT}},
  author    = {Bowden, Kevin and
               Walker, Marilyn},
  editor    = {B{\'e}chet, Fr{\'e}d{\'e}ric and
               Lef{\`e}vre, Fabrice and
               Asher, Nicholas and
               Kim, Seokhwan and
               Merlin, Teva},
  booktitle = {Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue},
  month     = aug,
  year      = {2025},
  address   = {Avignon, France},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.sigdial-1.44/},
  pages     = {538--560},
  abstract  = {Researchers in dialogue interaction have had a long-term interest in multi-domain human-computer conversations and how they differ from human-human conversations. Recently, research on dialogue has begun to rely more and more on corpus-based training of neural conversational models, and conversational LLMs such as ChatGPT. However, existing large open-domain dialogue corpora do not accurately capture the characteristics of social human-computer dialogue. This paper addresses this gap by synthesizing a new corpus of 4000 long social dialogues on 200 user-model based topics that we call User-Centric SocialChat (UCSC). We create UCSC with a novel method called Dialogue Scaffolding, where a real dialogue system, that competed successfully in the Alexa Prize, interacts with ChatGPT to generate conversations. The Dialogue Scaffolding method ensures that the dialogues closely resemble the social chat genre of human-computer dialogues. We evaluate UCSC to ensure quality and safety, and we measure lexical diversity and topic consistency to show that the conversations are not repetitive and stay on topic. We evaluate the utility of UCSC by fine-tuning a compact dialogue-level model, PerQy-DLM, and showing that it outperforms competitive fine-tuned models like COSMO, Vicuna, and RedPajama-Chat-3B.}
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for the paper with citekey bowden-walker-2025-dialogue. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="bowden-walker-2025-dialogue">
    <titleInfo>
      <title>Dialogue Scaffolding: Producing a Realistic Corpus of Human-Computer Open-Domain Dialogues Using a Spoken Dialogue System and ChatGPT</title>
    </titleInfo>
    <!-- Paper authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Kevin</namePart>
      <namePart type="family">Bowden</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marilyn</namePart>
      <namePart type="family">Walker</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume that contains this paper, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Frédéric</namePart>
        <namePart type="family">Béchet</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fabrice</namePart>
        <namePart type="family">Lefèvre</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nicholas</namePart>
        <namePart type="family">Asher</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Seokhwan</namePart>
        <namePart type="family">Kim</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Teva</namePart>
        <namePart type="family">Merlin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Avignon, France</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Researchers in dialogue interaction have had a long-term interest in multi-domain human-computer conversations and how they differ from human-human conversations. Recently, research on dialogue has begun to rely more and more on corpus-based training of neural conversational models, and conversational LLMs such as ChatGPT. However, existing large open-domain dialogue corpora do not accurately capture the characteristics of social human-computer dialogue. This paper addresses this gap by synthesizing a new corpus of 4000 long social dialogues on 200 user-model based topics that we call User-Centric SocialChat (UCSC). We create UCSC with a novel method called Dialogue Scaffolding, where a real dialogue system, that competed successfully in the Alexa Prize, interacts with ChatGPT to generate conversations. The Dialogue Scaffolding method ensures that the dialogues closely resemble the social chat genre of human-computer dialogues. We evaluate UCSC to ensure quality and safety, and we measure lexical diversity and topic consistency to show that the conversations are not repetitive and stay on topic. We evaluate the utility of UCSC by fine-tuning a compact dialogue-level model, PerQy-DLM, and showing that it outperforms competitive fine-tuned models like COSMO, Vicuna, and RedPajama-Chat-3B.</abstract>
    <identifier type="citekey">bowden-walker-2025-dialogue</identifier>
    <location>
      <url>https://aclanthology.org/2025.sigdial-1.44/</url>
    </location>
    <!-- Issue date and page range of the paper. -->
    <part>
      <date>2025-08</date>
      <extent unit="page">
        <start>538</start>
        <end>560</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Dialogue Scaffolding: Producing a Realistic Corpus of Human-Computer Open-Domain Dialogues Using a Spoken Dialogue System and ChatGPT
%A Bowden, Kevin
%A Walker, Marilyn
%Y Béchet, Frédéric
%Y Lefèvre, Fabrice
%Y Asher, Nicholas
%Y Kim, Seokhwan
%Y Merlin, Teva
%S Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue
%D 2025
%8 August
%I Association for Computational Linguistics
%C Avignon, France
%F bowden-walker-2025-dialogue
%X Researchers in dialogue interaction have had a long-term interest in multi-domain human-computer conversations and how they differ from human-human conversations. Recently, research on dialogue has begun to rely more and more on corpus-based training of neural conversational models, and conversational LLMs such as ChatGPT. However, existing large open-domain dialogue corpora do not accurately capture the characteristics of social human-computer dialogue. This paper addresses this gap by synthesizing a new corpus of 4000 long social dialogues on 200 user-model based topics that we call User-Centric SocialChat (UCSC). We create UCSC with a novel method called Dialogue Scaffolding, where a real dialogue system, that competed successfully in the Alexa Prize, interacts with ChatGPT to generate conversations. The Dialogue Scaffolding method ensures that the dialogues closely resemble the social chat genre of human-computer dialogues. We evaluate UCSC to ensure quality and safety, and we measure lexical diversity and topic consistency to show that the conversations are not repetitive and stay on topic. We evaluate the utility of UCSC by fine-tuning a compact dialogue-level model, PerQy-DLM, and showing that it outperforms competitive fine-tuned models like COSMO, Vicuna, and RedPajama-Chat-3B.
%U https://aclanthology.org/2025.sigdial-1.44/
%P 538-560
Markdown (Informal)
[Dialogue Scaffolding: Producing a Realistic Corpus of Human-Computer Open-Domain Dialogues Using a Spoken Dialogue System and ChatGPT](https://aclanthology.org/2025.sigdial-1.44/) (Bowden & Walker, SIGDIAL 2025)
ACL