% Conference paper; ACL Anthology ID 2025.sigdial-1.55.
% Field values use brace delimiters; `month` is the standard unquoted macro.
@inproceedings{njifenjou-etal-2025-open,
  author    = {Njifenjou, Ahmed and
               Sucal, Virgile and
               Jabaian, Bassam and
               Lef{\`e}vre, Fabrice},
  title     = {Open-Source Large Language Models as Multilingual Crowdworkers: Synthesizing Open-Domain Dialogues in Several Languages With No Examples in Targets and No Machine Translation},
  editor    = {B{\'e}chet, Fr{\'e}d{\'e}ric and
               Lef{\`e}vre, Fabrice and
               Asher, Nicholas and
               Kim, Seokhwan and
               Merlin, Teva},
  booktitle = {Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue},
  year      = {2025},
  month     = aug,
  address   = {Avignon, France},
  publisher = {Association for Computational Linguistics},
  pages     = {697--749},
  url       = {https://aclanthology.org/2025.sigdial-1.55/},
  abstract  = {The prevailing paradigm in the field of Open-Domain Dialogue (ODD) agents predominantly focuses on some high-resource languages such as English or Chinese. Furthermore, the financial and temporal investments required for crowd-sourcing such datasets, in multiple languages, are substantial. Fortunately, advancements in Large Language Models (LLMs), specifically instruction-tuning enabled them to execute tasks based on natural language instructions. Additionally, these models possess the capability to function in various languages within a single thread. Consequently, to generate new data samples in different languages, we propose leveraging these capabilities to replicate the data collection process. We introduce a pipeline for generating ODD data in multiple target languages using LLMs, with demonstrations provided in a unique source language. By eschewing explicit Machine Translation in this approach, we enhance language-specific nuances and cultural specificity. We apply this methodology to the PersonaChat dataset. To further improve the openness of generated dialogues and mimic real life scenarios, we added the notion of speech events corresponding to the type of conversation the speakers are involved in and that of common ground which represents the premises of a conversation.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single bibliographic record. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="njifenjou-etal-2025-open">
    <!-- Title of the paper itself. -->
    <titleInfo>
      <title>Open-Source Large Language Models as Multilingual Crowdworkers: Synthesizing Open-Domain Dialogues in Several Languages With No Examples in Targets and No Machine Translation</title>
    </titleInfo>
    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Ahmed</namePart>
      <namePart type="family">Njifenjou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Virgile</namePart>
      <namePart type="family">Sucal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bassam</namePart>
      <namePart type="family">Jabaian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fabrice</namePart>
      <namePart type="family">Lefèvre</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Frédéric</namePart>
        <namePart type="family">Béchet</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fabrice</namePart>
        <namePart type="family">Lefèvre</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nicholas</namePart>
        <namePart type="family">Asher</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Seokhwan</namePart>
        <namePart type="family">Kim</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Teva</namePart>
        <namePart type="family">Merlin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Avignon, France</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The prevailing paradigm in the field of Open-Domain Dialogue (ODD) agents predominantly focuses on some high-resource languages such as English or Chinese. Furthermore, the financial and temporal investments required for crowd-sourcing such datasets, in multiple languages, are substantial. Fortunately, advancements in Large Language Models (LLMs), specifically instruction-tuning enabled them to execute tasks based on natural language instructions. Additionally, these models possess the capability to function in various languages within a single thread. Consequently, to generate new data samples in different languages, we propose leveraging these capabilities to replicate the data collection process. We introduce a pipeline for generating ODD data in multiple target languages using LLMs, with demonstrations provided in a unique source language. By eschewing explicit Machine Translation in this approach, we enhance language-specific nuances and cultural specificity. We apply this methodology to the PersonaChat dataset. To further improve the openness of generated dialogues and mimic real life scenarios, we added the notion of speech events corresponding to the type of conversation the speakers are involved in and that of common ground which represents the premises of a conversation.</abstract>
    <!-- Citation key shared with the BibTeX and EndNote exports of this record. -->
    <identifier type="citekey">njifenjou-etal-2025-open</identifier>
    <location>
      <url>https://aclanthology.org/2025.sigdial-1.55/</url>
    </location>
    <!-- Page range of the paper within the host volume. -->
    <part>
      <date>2025-08</date>
      <extent unit="page">
        <start>697</start>
        <end>749</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Open-Source Large Language Models as Multilingual Crowdworkers: Synthesizing Open-Domain Dialogues in Several Languages With No Examples in Targets and No Machine Translation
%A Njifenjou, Ahmed
%A Sucal, Virgile
%A Jabaian, Bassam
%A Lefèvre, Fabrice
%Y Béchet, Frédéric
%Y Lefèvre, Fabrice
%Y Asher, Nicholas
%Y Kim, Seokhwan
%Y Merlin, Teva
%S Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue
%D 2025
%8 August
%I Association for Computational Linguistics
%C Avignon, France
%F njifenjou-etal-2025-open
%X The prevailing paradigm in the field of Open-Domain Dialogue (ODD) agents predominantly focuses on some high-resource languages such as English or Chinese. Furthermore, the financial and temporal investments required for crowd-sourcing such datasets, in multiple languages, are substantial. Fortunately, advancements in Large Language Models (LLMs), specifically instruction-tuning enabled them to execute tasks based on natural language instructions. Additionally, these models possess the capability to function in various languages within a single thread. Consequently, to generate new data samples in different languages, we propose leveraging these capabilities to replicate the data collection process. We introduce a pipeline for generating ODD data in multiple target languages using LLMs, with demonstrations provided in a unique source language. By eschewing explicit Machine Translation in this approach, we enhance language-specific nuances and cultural specificity. We apply this methodology to the PersonaChat dataset. To further improve the openness of generated dialogues and mimic real life scenarios, we added the notion of speech events corresponding to the type of conversation the speakers are involved in and that of common ground which represents the premises of a conversation.
%U https://aclanthology.org/2025.sigdial-1.55/
%P 697-749
Markdown (Informal)
[Open-Source Large Language Models as Multilingual Crowdworkers: Synthesizing Open-Domain Dialogues in Several Languages With No Examples in Targets and No Machine Translation](https://aclanthology.org/2025.sigdial-1.55/) (Njifenjou et al., SIGDIAL 2025)
ACL