@inproceedings{das-etal-2025-investigating,
title = "Investigating the Effect of Backtranslation for {I}ndic Languages",
author = "Das, Sudhansu Bala and
Choudhury, Samujjal and
Mishra, Dr Tapas Kumar and
Patra, Dr Bidyut Kr",
editor = "Weerasinghe, Ruvan and
Anuradha, Isuri and
Sumanathilaka, Deshan",
booktitle = "Proceedings of the First Workshop on Natural Language Processing for Indo-Aryan and Dravidian Languages",
month = jan,
year = "2025",
address = "Abu Dhabi",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.indonlp-1.18/",
pages = "152--165",
abstract = "Neural machine translation (NMT) is becoming increasingly popular as an effective method of automated language translation. However, due to a scarcity of training datasets, its effectiveness is limited when used with low-resource languages, such as Indian Languages (ILs). The lack of parallel datasets in Natural Language Processing (NLP) makes it difficult to investigate many ILs for Machine Translation (MT). A data augmentation approach such as Backtranslation (BT) can be used to enhance the size of the training dataset. This paper presents the development of a NMT model for ILs within the context of a MT system. To address the issue of data scarcity, the paper examines the effectiveness of a BT approach for ILs that uses both monolingual and parallel datasets. Experimental results reveal that while the BT has improved the model`s performance, however, it is not as significant as expected. It has also been observed that, even though the English-ILs and ILs-English models are trained on the same dataset, the ILs-English models perform better in all evaluation metrics. The reason for this is that ILs frequently differ in sentence structure, word order, and morphological richness from English. The paper also includes error analysis for translations between languages that were utilized in experiments utilizing the Multidimensional Quality Metrics (MQM) framework."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="das-etal-2025-investigating">
<titleInfo>
<title>Investigating the Effect of Backtranslation for Indic Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sudhansu</namePart>
<namePart type="given">Bala</namePart>
<namePart type="family">Das</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samujjal</namePart>
<namePart type="family">Choudhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Tapas</namePart>
<namePart type="given">Kumar</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Bidyut</namePart>
<namePart type="given">Kr</namePart>
<namePart type="family">Patra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Natural Language Processing for Indo-Aryan and Dravidian Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruvan</namePart>
<namePart type="family">Weerasinghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isuri</namePart>
<namePart type="family">Anuradha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Deshan</namePart>
<namePart type="family">Sumanathilaka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Neural machine translation (NMT) is becoming increasingly popular as an effective method of automated language translation. However, due to a scarcity of training datasets, its effectiveness is limited when used with low-resource languages, such as Indian Languages (ILs). The lack of parallel datasets in Natural Language Processing (NLP) makes it difficult to investigate many ILs for Machine Translation (MT). A data augmentation approach such as Backtranslation (BT) can be used to enhance the size of the training dataset. This paper presents the development of a NMT model for ILs within the context of a MT system. To address the issue of data scarcity, the paper examines the effectiveness of a BT approach for ILs that uses both monolingual and parallel datasets. Experimental results reveal that while the BT has improved the model‘s performance, however, it is not as significant as expected. It has also been observed that, even though the English-ILs and ILs-English models are trained on the same dataset, the ILs-English models perform better in all evaluation metrics. The reason for this is that ILs frequently differ in sentence structure, word order, and morphological richness from English. The paper also includes error analysis for translations between languages that were utilized in experiments utilizing the Multidimensional Quality Metrics (MQM) framework.</abstract>
<identifier type="citekey">das-etal-2025-investigating</identifier>
<location>
<url>https://aclanthology.org/2025.indonlp-1.18/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>152</start>
<end>165</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Investigating the Effect of Backtranslation for Indic Languages
%A Das, Sudhansu Bala
%A Choudhury, Samujjal
%A Mishra, Dr Tapas Kumar
%A Patra, Dr Bidyut Kr
%Y Weerasinghe, Ruvan
%Y Anuradha, Isuri
%Y Sumanathilaka, Deshan
%S Proceedings of the First Workshop on Natural Language Processing for Indo-Aryan and Dravidian Languages
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi
%F das-etal-2025-investigating
%X Neural machine translation (NMT) is becoming increasingly popular as an effective method of automated language translation. However, due to a scarcity of training datasets, its effectiveness is limited when used with low-resource languages, such as Indian Languages (ILs). The lack of parallel datasets in Natural Language Processing (NLP) makes it difficult to investigate many ILs for Machine Translation (MT). A data augmentation approach such as Backtranslation (BT) can be used to enhance the size of the training dataset. This paper presents the development of a NMT model for ILs within the context of a MT system. To address the issue of data scarcity, the paper examines the effectiveness of a BT approach for ILs that uses both monolingual and parallel datasets. Experimental results reveal that while the BT has improved the model‘s performance, however, it is not as significant as expected. It has also been observed that, even though the English-ILs and ILs-English models are trained on the same dataset, the ILs-English models perform better in all evaluation metrics. The reason for this is that ILs frequently differ in sentence structure, word order, and morphological richness from English. The paper also includes error analysis for translations between languages that were utilized in experiments utilizing the Multidimensional Quality Metrics (MQM) framework.
%U https://aclanthology.org/2025.indonlp-1.18/
%P 152-165
Markdown (Informal)
[Investigating the Effect of Backtranslation for Indic Languages](https://aclanthology.org/2025.indonlp-1.18/) (Das et al., IndoNLP 2025)
ACL
- Sudhansu Bala Das, Samujjal Choudhury, Dr Tapas Kumar Mishra, and Dr Bidyut Kr Patra. 2025. Investigating the Effect of Backtranslation for Indic Languages. In Proceedings of the First Workshop on Natural Language Processing for Indo-Aryan and Dravidian Languages, pages 152–165, Abu Dhabi. Association for Computational Linguistics.