@inproceedings{paul-etal-2025-aligning,
title = "Aligning Large Language Models to Low-Resource Languages through {LLM}-Based Selective Translation: A Systematic Study",
author = "Paul, Rakesh and
Kamath, Anusha and
Singla, Kanishk and
Joshi, Raviraj and
Vaidya, Utkarsh and
Chauhan, Sanjay Singh and
Wartikar, Niranjan",
editor = "Bhattacharya, Arnab and
Goyal, Pawan and
Ghosh, Saptarshi and
Ghosh, Kripabandhu",
booktitle = "Proceedings of the 1st Workshop on Benchmarks, Harmonization, Annotation, and Standardization for Human-Centric AI in Indian Languages (BHASHA 2025)",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.bhasha-1.6/",
pages = "69--82",
ISBN = "979-8-89176-313-5",
abstract = "Multilingual large language models (LLMs) often demonstrate a performance gap between English and non-English languages, particularly in low-resource settings. Aligning these models to low-resource languages is essential yet challenging due to limited high-quality data. While English alignment datasets are readily available, curating equivalent data in other languages is expensive and time-consuming. A common workaround is to translate existing English alignment data; however, standard translation techniques often fail to preserve critical elements such as code, mathematical expressions, and structured formats like JSON. In this work, we investigate LLM-based selective translation, a technique that selectively translates only the translatable parts of a text while preserving non-translatable content and sentence structure. We conduct a systematic study to explore key questions around this approach, including its effectiveness compared to vanilla translation, the importance of filtering noisy outputs, and the benefits of mixing translated samples with original English data during alignment. Our experiments focus on the low-resource Indic language Hindi and compare translations generated by Google Cloud Platform (GCP) and Llama-3.1-405B. The results highlight the promise of selective translation as a practical and effective method for improving multilingual alignment in LLMs."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="paul-etal-2025-aligning">
<titleInfo>
<title>Aligning Large Language Models to Low-Resource Languages through LLM-Based Selective Translation: A Systematic Study</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rakesh</namePart>
<namePart type="family">Paul</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anusha</namePart>
<namePart type="family">Kamath</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kanishk</namePart>
<namePart type="family">Singla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raviraj</namePart>
<namePart type="family">Joshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Utkarsh</namePart>
<namePart type="family">Vaidya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sanjay</namePart>
<namePart type="given">Singh</namePart>
<namePart type="family">Chauhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niranjan</namePart>
<namePart type="family">Wartikar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Benchmarks, Harmonization, Annotation, and Standardization for Human-Centric AI in Indian Languages (BHASHA 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arnab</namePart>
<namePart type="family">Bhattacharya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pawan</namePart>
<namePart type="family">Goyal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saptarshi</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kripabandhu</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-313-5</identifier>
</relatedItem>
<abstract>Multilingual large language models (LLMs) often demonstrate a performance gap between English and non-English languages, particularly in low-resource settings. Aligning these models to low-resource languages is essential yet challenging due to limited high-quality data. While English alignment datasets are readily available, curating equivalent data in other languages is expensive and time-consuming. A common workaround is to translate existing English alignment data; however, standard translation techniques often fail to preserve critical elements such as code, mathematical expressions, and structured formats like JSON. In this work, we investigate LLM-based selective translation, a technique that selectively translates only the translatable parts of a text while preserving non-translatable content and sentence structure. We conduct a systematic study to explore key questions around this approach, including its effectiveness compared to vanilla translation, the importance of filtering noisy outputs, and the benefits of mixing translated samples with original English data during alignment. Our experiments focus on the low-resource Indic language Hindi and compare translations generated by Google Cloud Platform (GCP) and Llama-3.1-405B. The results highlight the promise of selective translation as a practical and effective method for improving multilingual alignment in LLMs.</abstract>
<identifier type="citekey">paul-etal-2025-aligning</identifier>
<location>
<url>https://aclanthology.org/2025.bhasha-1.6/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>69</start>
<end>82</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Aligning Large Language Models to Low-Resource Languages through LLM-Based Selective Translation: A Systematic Study
%A Paul, Rakesh
%A Kamath, Anusha
%A Singla, Kanishk
%A Joshi, Raviraj
%A Vaidya, Utkarsh
%A Chauhan, Sanjay Singh
%A Wartikar, Niranjan
%Y Bhattacharya, Arnab
%Y Goyal, Pawan
%Y Ghosh, Saptarshi
%Y Ghosh, Kripabandhu
%S Proceedings of the 1st Workshop on Benchmarks, Harmonization, Annotation, and Standardization for Human-Centric AI in Indian Languages (BHASHA 2025)
%D 2025
%8 December
%I Association for Computational Linguistics
%C Mumbai, India
%@ 979-8-89176-313-5
%F paul-etal-2025-aligning
%X Multilingual large language models (LLMs) often demonstrate a performance gap between English and non-English languages, particularly in low-resource settings. Aligning these models to low-resource languages is essential yet challenging due to limited high-quality data. While English alignment datasets are readily available, curating equivalent data in other languages is expensive and time-consuming. A common workaround is to translate existing English alignment data; however, standard translation techniques often fail to preserve critical elements such as code, mathematical expressions, and structured formats like JSON. In this work, we investigate LLM-based selective translation, a technique that selectively translates only the translatable parts of a text while preserving non-translatable content and sentence structure. We conduct a systematic study to explore key questions around this approach, including its effectiveness compared to vanilla translation, the importance of filtering noisy outputs, and the benefits of mixing translated samples with original English data during alignment. Our experiments focus on the low-resource Indic language Hindi and compare translations generated by Google Cloud Platform (GCP) and Llama-3.1-405B. The results highlight the promise of selective translation as a practical and effective method for improving multilingual alignment in LLMs.
%U https://aclanthology.org/2025.bhasha-1.6/
%P 69-82
Markdown (Informal)
[Aligning Large Language Models to Low-Resource Languages through LLM-Based Selective Translation: A Systematic Study](https://aclanthology.org/2025.bhasha-1.6/) (Paul et al., BHASHA 2025)
ACL
- Rakesh Paul, Anusha Kamath, Kanishk Singla, Raviraj Joshi, Utkarsh Vaidya, Sanjay Singh Chauhan, and Niranjan Wartikar. 2025. Aligning Large Language Models to Low-Resource Languages through LLM-Based Selective Translation: A Systematic Study. In Proceedings of the 1st Workshop on Benchmarks, Harmonization, Annotation, and Standardization for Human-Centric AI in Indian Languages (BHASHA 2025), pages 69–82, Mumbai, India. Association for Computational Linguistics.