@inproceedings{maheshwary-etal-2025-m2lingual,
title = "{M}2{L}ingual: Enhancing Multilingual, Multi-Turn Instruction Alignment in Large Language Models",
author = "Maheshwary, Rishabh and
Yadav, Vikas and
Nguyen, Hoang H and
Mahajan, Khyati and
Madhusudhan, Sathwik Tejaswi",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.489/",
doi = "10.18653/v1/2025.naacl-long.489",
pages = "9676--9713",
ISBN = "979-8-89176-189-6",
abstract = "Collecting instruction fine-tuning (IFT) data is a resource and time intensive task especially in multilingual setting where finding proficient native speakers is challenging. Moreover, traditional data collection is prone to privacy risks, toxicity and lacks scalability. While, fully synthetic datasets are a promising alternative, research on their use in multilingual domain is limited as existing approaches still rely on machine translation to improve multilingual performance. To bridge this gap we introduce M2Lingual, the first fully synthetic, multi-turn multilingual dataset having 175K conversations across 70 languages with a balanced mix of high, low and mid-resourced languages. M2Lingual is constructed using a cost-efficient and scalable method that uses our novel two-step Evol prompt taxonomy to transform a small set of human written instructions to complex and challenging conversations. Results across three model families, six baseline datasets and evaluation spanning 31 languages demonstrates the effectiveness of M2Lingual over other datasets."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="maheshwary-etal-2025-m2lingual">
<titleInfo>
<title>M2Lingual: Enhancing Multilingual, Multi-Turn Instruction Alignment in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rishabh</namePart>
<namePart type="family">Maheshwary</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vikas</namePart>
<namePart type="family">Yadav</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hoang</namePart>
<namePart type="given">H</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khyati</namePart>
<namePart type="family">Mahajan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sathwik</namePart>
<namePart type="given">Tejaswi</namePart>
<namePart type="family">Madhusudhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>Collecting instruction fine-tuning (IFT) data is a resource and time intensive task especially in multilingual setting where finding proficient native speakers is challenging. Moreover, traditional data collection is prone to privacy risks, toxicity and lacks scalability. While, fully synthetic datasets are a promising alternative, research on their use in multilingual domain is limited as existing approaches still rely on machine translation to improve multilingual performance. To bridge this gap we introduce M2Lingual, the first fully synthetic, multi-turn multilingual dataset having 175K conversations across 70 languages with a balanced mix of high, low and mid-resourced languages. M2Lingual is constructed using a cost-efficient and scalable method that uses our novel two-step Evol prompt taxonomy to transform a small set of human written instructions to complex and challenging conversations. Results across three model families, six baseline datasets and evaluation spanning 31 languages demonstrates the effectiveness of M2Lingual over other datasets.</abstract>
<identifier type="citekey">maheshwary-etal-2025-m2lingual</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.489</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.489/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>9676</start>
<end>9713</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T M2Lingual: Enhancing Multilingual, Multi-Turn Instruction Alignment in Large Language Models
%A Maheshwary, Rishabh
%A Yadav, Vikas
%A Nguyen, Hoang H.
%A Mahajan, Khyati
%A Madhusudhan, Sathwik Tejaswi
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F maheshwary-etal-2025-m2lingual
%X Collecting instruction fine-tuning (IFT) data is a resource and time intensive task especially in multilingual setting where finding proficient native speakers is challenging. Moreover, traditional data collection is prone to privacy risks, toxicity and lacks scalability. While, fully synthetic datasets are a promising alternative, research on their use in multilingual domain is limited as existing approaches still rely on machine translation to improve multilingual performance. To bridge this gap we introduce M2Lingual, the first fully synthetic, multi-turn multilingual dataset having 175K conversations across 70 languages with a balanced mix of high, low and mid-resourced languages. M2Lingual is constructed using a cost-efficient and scalable method that uses our novel two-step Evol prompt taxonomy to transform a small set of human written instructions to complex and challenging conversations. Results across three model families, six baseline datasets and evaluation spanning 31 languages demonstrates the effectiveness of M2Lingual over other datasets.
%R 10.18653/v1/2025.naacl-long.489
%U https://aclanthology.org/2025.naacl-long.489/
%U https://doi.org/10.18653/v1/2025.naacl-long.489
%P 9676-9713
Markdown (Informal)
[M2Lingual: Enhancing Multilingual, Multi-Turn Instruction Alignment in Large Language Models](https://aclanthology.org/2025.naacl-long.489/) (Maheshwary et al., NAACL 2025)
ACL
- Rishabh Maheshwary, Vikas Yadav, Hoang H Nguyen, Khyati Mahajan, and Sathwik Tejaswi Madhusudhan. 2025. M2Lingual: Enhancing Multilingual, Multi-Turn Instruction Alignment in Large Language Models. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 9676–9713, Albuquerque, New Mexico. Association for Computational Linguistics.