@inproceedings{talukdar-etal-2024-synthetic,
title = "Synthetic Data and Model Dynamics based Performance Analysis for {A}ssamese-{B}odo Low Resource {NMT}",
author = "Talukdar, Kuwali and
Sarma, Shikhar Kumar and
Kashyap, Kishore",
editor = "Lalitha Devi, Sobha and
Arora, Karunesh",
booktitle = "Proceedings of the 21st International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2024",
address = "AU-KBC Research Centre, Chennai, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2024.icon-1.20/",
pages = "178--187",
    abstract = "This paper presents details of modelling and performance analysis of Neural Machine Translation (NMT) for the low-resource Assamese-Bodo language pair, focusing on model tuning and the use of synthetic data. Given the scarcity of parallel corpora for these languages, synthetic data generation techniques, such as back-translation, were employed to enhance translation performance. The NMT architecture was used along with the necessary preprocessing steps as per the NMT pipeline. Experiments across varying model parameters were performed and scores recorded. The model's performance was evaluated using the BLEU score, which showed significant improvement when synthetic data was incorporated into the training process. While a base model trained on a relatively small gold-standard dataset yielded an overall BLEU of 11.35, the optimized, tuned model with synthetic data achieved considerable improvement in BLEU scores across domains, with an overall BLEU of up to 14.74. Challenges related to data scarcity and model optimization are also discussed, along with potential future improvements."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="talukdar-etal-2024-synthetic">
<titleInfo>
<title>Synthetic Data and Model Dynamics based Performance Analysis for Assamese-Bodo Low Resource NMT</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kuwali</namePart>
<namePart type="family">Talukdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shikhar</namePart>
<namePart type="given">Kumar</namePart>
<namePart type="family">Sarma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kishore</namePart>
<namePart type="family">Kashyap</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sobha</namePart>
<namePart type="family">Lalitha Devi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karunesh</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">AU-KBC Research Centre, Chennai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents details of modelling and performance analysis of Neural Machine Translation (NMT) for the low-resource Assamese-Bodo language pair, focusing on model tuning and the use of synthetic data. Given the scarcity of parallel corpora for these languages, synthetic data generation techniques, such as back-translation, were employed to enhance translation performance. The NMT architecture was used along with the necessary preprocessing steps as per the NMT pipeline. Experiments across varying model parameters were performed and scores recorded. The model's performance was evaluated using the BLEU score, which showed significant improvement when synthetic data was incorporated into the training process. While a base model trained on a relatively small gold-standard dataset yielded an overall BLEU of 11.35, the optimized, tuned model with synthetic data achieved considerable improvement in BLEU scores across domains, with an overall BLEU of up to 14.74. Challenges related to data scarcity and model optimization are also discussed, along with potential future improvements.</abstract>
<identifier type="citekey">talukdar-etal-2024-synthetic</identifier>
<location>
<url>https://aclanthology.org/2024.icon-1.20/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>178</start>
<end>187</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Synthetic Data and Model Dynamics based Performance Analysis for Assamese-Bodo Low Resource NMT
%A Talukdar, Kuwali
%A Sarma, Shikhar Kumar
%A Kashyap, Kishore
%Y Lalitha Devi, Sobha
%Y Arora, Karunesh
%S Proceedings of the 21st International Conference on Natural Language Processing (ICON)
%D 2024
%8 December
%I NLP Association of India (NLPAI)
%C AU-KBC Research Centre, Chennai, India
%F talukdar-etal-2024-synthetic
%X This paper presents details of modelling and performance analysis of Neural Machine Translation (NMT) for the low-resource Assamese-Bodo language pair, focusing on model tuning and the use of synthetic data. Given the scarcity of parallel corpora for these languages, synthetic data generation techniques, such as back-translation, were employed to enhance translation performance. The NMT architecture was used along with the necessary preprocessing steps as per the NMT pipeline. Experiments across varying model parameters were performed and scores recorded. The model's performance was evaluated using the BLEU score, which showed significant improvement when synthetic data was incorporated into the training process. While a base model trained on a relatively small gold-standard dataset yielded an overall BLEU of 11.35, the optimized, tuned model with synthetic data achieved considerable improvement in BLEU scores across domains, with an overall BLEU of up to 14.74. Challenges related to data scarcity and model optimization are also discussed, along with potential future improvements.
%U https://aclanthology.org/2024.icon-1.20/
%P 178-187
Markdown (Informal)
[Synthetic Data and Model Dynamics based Performance Analysis for Assamese-Bodo Low Resource NMT](https://aclanthology.org/2024.icon-1.20/) (Talukdar et al., ICON 2024)
ACL
Kuwali Talukdar, Shikhar Kumar Sarma, and Kishore Kashyap. 2024. Synthetic Data and Model Dynamics based Performance Analysis for Assamese-Bodo Low Resource NMT. In Proceedings of the 21st International Conference on Natural Language Processing (ICON), pages 178–187, AU-KBC Research Centre, Chennai, India. NLP Association of India (NLPAI).