@inproceedings{kunchukuttan-etal-2025-data,
title = "Data and Model Centric Approaches for Expansion of Large Language Models to New languages",
author = "Kunchukuttan, Anoop and
Dabre, Raj and
Murthy, Rudra and
Khan, Mohammed Safi Ur Rahman and
Jayakumar, Thanmay",
editor = "Pyatkin, Valentina and
Vlachos, Andreas",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Tutorial Abstracts",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-tutorials.5/",
pages = "12--13",
ISBN = "979-8-89176-336-4",
abstract = "Despite the increasing pace of Large Language Model (LLM) research, a vast majority of existing LLMs mainly support English alongside a handful of high resource languages, leaving a major gap for most low-resource languages. In this tutorial, we focus on approaches to expand the language coverage of LLMs. This provides an efficient and viable path to bring LLM technologies to low-resource languages, instead of training from scratch. We look at approaches at various stages of the LLM training pipeline, like tokenizer training, pre-training, instruction tuning, alignment, evaluation, etc., where adaptations are made to support new languages. We look at data-oriented approaches as well as model-oriented approaches. We hope that our tutorial enables researchers and practitioners to work on incorporating additional languages and tasks into existing LLMs to enhance inclusivity and coverage."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kunchukuttan-etal-2025-data">
    <titleInfo>
      <title>Data and Model Centric Approaches for Expansion of Large Language Models to New Languages</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Anoop</namePart>
      <namePart type="family">Kunchukuttan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Raj</namePart>
      <namePart type="family">Dabre</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rudra</namePart>
      <namePart type="family">Murthy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mohammed</namePart>
      <namePart type="given">Safi</namePart>
      <namePart type="given">Ur</namePart>
      <namePart type="given">Rahman</namePart>
      <namePart type="family">Khan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Thanmay</namePart>
      <namePart type="family">Jayakumar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Tutorial Abstracts</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Valentina</namePart>
        <namePart type="family">Pyatkin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Vlachos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-336-4</identifier>
    </relatedItem>
    <abstract>Despite the increasing pace of Large Language Model (LLM) research, the vast majority of existing LLMs mainly support English alongside a handful of high-resource languages, leaving a major gap for most low-resource languages. In this tutorial, we focus on approaches to expand the language coverage of LLMs, which offers an efficient and viable path to bring LLM technologies to low-resource languages without training from scratch. We examine approaches at various stages of the LLM training pipeline, such as tokenizer training, pre-training, instruction tuning, alignment, and evaluation, where adaptations are made to support new languages. We cover both data-oriented and model-oriented approaches. We hope that our tutorial enables researchers and practitioners to incorporate additional languages and tasks into existing LLMs, enhancing inclusivity and coverage.</abstract>
    <identifier type="citekey">kunchukuttan-etal-2025-data</identifier>
    <location>
      <url>https://aclanthology.org/2025.emnlp-tutorials.5/</url>
    </location>
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>12</start>
        <end>13</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Data and Model Centric Approaches for Expansion of Large Language Models to New Languages
%A Kunchukuttan, Anoop
%A Dabre, Raj
%A Murthy, Rudra
%A Khan, Mohammed Safi Ur Rahman
%A Jayakumar, Thanmay
%Y Pyatkin, Valentina
%Y Vlachos, Andreas
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Tutorial Abstracts
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-336-4
%F kunchukuttan-etal-2025-data
%X Despite the increasing pace of Large Language Model (LLM) research, the vast majority of existing LLMs mainly support English alongside a handful of high-resource languages, leaving a major gap for most low-resource languages. In this tutorial, we focus on approaches to expand the language coverage of LLMs, which offers an efficient and viable path to bring LLM technologies to low-resource languages without training from scratch. We examine approaches at various stages of the LLM training pipeline, such as tokenizer training, pre-training, instruction tuning, alignment, and evaluation, where adaptations are made to support new languages. We cover both data-oriented and model-oriented approaches. We hope that our tutorial enables researchers and practitioners to incorporate additional languages and tasks into existing LLMs, enhancing inclusivity and coverage.
%U https://aclanthology.org/2025.emnlp-tutorials.5/
%P 12-13

Markdown (Informal)
[Data and Model Centric Approaches for Expansion of Large Language Models to New Languages](https://aclanthology.org/2025.emnlp-tutorials.5/) (Kunchukuttan et al., EMNLP 2025)

ACL
Anoop Kunchukuttan, Raj Dabre, Rudra Murthy, Mohammed Safi Ur Rahman Khan, and Thanmay Jayakumar. 2025. Data and Model Centric Approaches for Expansion of Large Language Models to New Languages. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Tutorial Abstracts, pages 12–13, Suzhou, China. Association for Computational Linguistics.