@inproceedings{asha-hosahalli-lakshmaiah-2023-kt2,
title = "{KT}2: {K}annada-{T}ulu Parallel Corpus Construction for Neural Machine Translation",
author = "Hegde, Asha and
Shashirekha, Hosahalli Lakshmaiah",
editor = "D. Pawar, Jyoti and
Lalitha Devi, Sobha",
booktitle = "Proceedings of the 20th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2023",
address = "Goa University, Goa, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2023.icon-1.75",
pages = "743--753",
abstract = "In the last decade, Neural Machine Translation (NMT) has experienced substantial advances. However, its widespread success has revealed a limitation in terms of reduced proficiency when dealing with under-resourced language pairs, mainly due to the lack of parallel corpora in comparison to high-resourced language pairs like English-German, EnglishSpanish, and English-French. As a result, researchers have increasingly focused on implementing NMT techniques tailored to underresourced language pairs and thereby, the construction/collection of parallel corpora. In view of the scarcity of parallel corpus for underresourced languages, the strategies for building a Kannada-Tulu parallel corpus and baseline models for Machine Translation (MT) of Kannada-Tulu are described in this paper. Both Kannada and Tulu languages are under-resourced due to lack of processing tools and digital resources, especially parallel corpora, which are critical for MT development. Kannada-Tulu parallel corpus is constructed in two ways: i) Manual Translation and ii) Automatic Text Generation (ATG). Various encoderdecoder based NMT approaches, including Recurrent Neural Network (RNN), Bidirectional RNN (BiRNN), and transformer-based architectures, trained with Gated Recurrent Units (GRU) and Long Short Term Memory (LSTM) units, are explored as baseline models for Kannada to Tulu (Kan-Tul) and Tulu to Kannada (Kan-Tul) sentence-level translations. Additionally, the study explores sub-word tokenization techniques for Kannada-Tulu language pairs, and the performances of these NMT models are evaluated using Character n-gram Fscore (CHRF) and Bilingual Evaluation Understudy (BLEU) scores. Among the baselines, the transformer-based models outperformed other models with BLEU scores of 0.241 and 0.341 and CHRF scores of 0.502 and 0.598 for KanTul and Kan-Tul sentence-level translations, respectively.",
}
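The abstract mentions sub-word tokenization for the Kannada-Tulu language pair. As an illustrative aside, the sketch below shows BPE sub-word segmentation with the SentencePiece library; the paper does not state which toolkit or settings it used, so the input file name, model prefix, and vocabulary size here are assumptions.

```python
# Minimal sub-word (BPE) tokenization sketch using SentencePiece.
# Assumptions: "train.kan" holds one Kannada sentence per line;
# vocab_size=8000 and model_type="bpe" are illustrative choices,
# not settings reported by the paper.
import sentencepiece as spm

# Train a BPE model on the Kannada side of the parallel corpus.
spm.SentencePieceTrainer.train(
    input="train.kan",
    model_prefix="kan_bpe",
    vocab_size=8000,
    model_type="bpe",
)

# Segment a sentence into sub-word pieces with the trained model.
sp = spm.SentencePieceProcessor(model_file="kan_bpe.model")
print(sp.encode("ಕನ್ನಡ ಮತ್ತು ತುಳು", out_type=str))
```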
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="asha-hosahalli-lakshmaiah-2023-kt2">
<titleInfo>
<title>KT2: Kannada-Tulu Parallel Corpus Construction for Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Asha</namePart>
<namePart type="family">Hegde</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hosahalli</namePart>
<namePart type="given">Lakshmaiah</namePart>
<namePart type="family">Shashirekha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jyoti</namePart>
<namePart type="family">D. Pawar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sobha</namePart>
<namePart type="family">Lalitha Devi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">Goa University, Goa, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In the last decade, Neural Machine Translation (NMT) has experienced substantial advances. However, its widespread success has revealed a limitation in terms of reduced proficiency when dealing with under-resourced language pairs, mainly due to the lack of parallel corpora in comparison to high-resourced language pairs like English-German, EnglishSpanish, and English-French. As a result, researchers have increasingly focused on implementing NMT techniques tailored to underresourced language pairs and thereby, the construction/collection of parallel corpora. In view of the scarcity of parallel corpus for underresourced languages, the strategies for building a Kannada-Tulu parallel corpus and baseline models for Machine Translation (MT) of Kannada-Tulu are described in this paper. Both Kannada and Tulu languages are under-resourced due to lack of processing tools and digital resources, especially parallel corpora, which are critical for MT development. Kannada-Tulu parallel corpus is constructed in two ways: i) Manual Translation and ii) Automatic Text Generation (ATG). Various encoderdecoder based NMT approaches, including Recurrent Neural Network (RNN), Bidirectional RNN (BiRNN), and transformer-based architectures, trained with Gated Recurrent Units (GRU) and Long Short Term Memory (LSTM) units, are explored as baseline models for Kannada to Tulu (Kan-Tul) and Tulu to Kannada (Kan-Tul) sentence-level translations. Additionally, the study explores sub-word tokenization techniques for Kannada-Tulu language pairs, and the performances of these NMT models are evaluated using Character n-gram Fscore (CHRF) and Bilingual Evaluation Understudy (BLEU) scores. Among the baselines, the transformer-based models outperformed other models with BLEU scores of 0.241 and 0.341 and CHRF scores of 0.502 and 0.598 for KanTul and Kan-Tul sentence-level translations, respectively.</abstract>
<identifier type="citekey">asha-hosahalli-lakshmaiah-2023-kt2</identifier>
<location>
<url>https://aclanthology.org/2023.icon-1.75</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>743</start>
<end>753</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T KT2: Kannada-Tulu Parallel Corpus Construction for Neural Machine Translation
%A Hegde, Asha
%A Shashirekha, Hosahalli Lakshmaiah
%Y D. Pawar, Jyoti
%Y Lalitha Devi, Sobha
%S Proceedings of the 20th International Conference on Natural Language Processing (ICON)
%D 2023
%8 December
%I NLP Association of India (NLPAI)
%C Goa University, Goa, India
%F asha-hosahalli-lakshmaiah-2023-kt2
%X In the last decade, Neural Machine Translation (NMT) has experienced substantial advances. However, its widespread success has revealed a limitation in terms of reduced proficiency when dealing with under-resourced language pairs, mainly due to the lack of parallel corpora in comparison to high-resourced language pairs like English-German, English-Spanish, and English-French. As a result, researchers have increasingly focused on implementing NMT techniques tailored to under-resourced language pairs and, thereby, on the construction/collection of parallel corpora. In view of the scarcity of parallel corpora for under-resourced languages, this paper describes strategies for building a Kannada-Tulu parallel corpus and baseline models for Machine Translation (MT) of Kannada-Tulu. Both Kannada and Tulu are under-resourced due to the lack of processing tools and digital resources, especially parallel corpora, which are critical for MT development. The Kannada-Tulu parallel corpus is constructed in two ways: i) Manual Translation and ii) Automatic Text Generation (ATG). Various encoder-decoder based NMT approaches, including Recurrent Neural Network (RNN), Bidirectional RNN (BiRNN), and transformer-based architectures, trained with Gated Recurrent Unit (GRU) and Long Short-Term Memory (LSTM) units, are explored as baseline models for Kannada to Tulu (Kan-Tul) and Tulu to Kannada (Tul-Kan) sentence-level translation. Additionally, the study explores sub-word tokenization techniques for the Kannada-Tulu language pair, and the performance of these NMT models is evaluated using Character n-gram F-score (CHRF) and Bilingual Evaluation Understudy (BLEU) scores. Among the baselines, the transformer-based models outperformed the others, with BLEU scores of 0.241 and 0.341 and CHRF scores of 0.502 and 0.598 for Kan-Tul and Tul-Kan sentence-level translation, respectively.
%U https://aclanthology.org/2023.icon-1.75
%P 743-753
Markdown (Informal)
[KT2: Kannada-Tulu Parallel Corpus Construction for Neural Machine Translation](https://aclanthology.org/2023.icon-1.75) (Hegde & Shashirekha, ICON 2023)
ACL
Asha Hegde and Hosahalli Lakshmaiah Shashirekha. 2023. KT2: Kannada-Tulu Parallel Corpus Construction for Neural Machine Translation. In Proceedings of the 20th International Conference on Natural Language Processing (ICON), pages 743–753, Goa University, Goa, India. NLP Association of India (NLPAI).
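The baselines above are evaluated with CHRF and BLEU. As an illustrative aside, the sketch below shows corpus-level scoring with the sacrebleu library; the hypothesis and reference sentences are placeholders, and sacrebleu reports scores on a 0-100 scale, whereas the abstract quotes values on a 0-1 scale.

```python
# Minimal BLEU/chrF evaluation sketch using sacrebleu.
# The sentences below are placeholders, not the paper's test data.
from sacrebleu.metrics import BLEU, CHRF

hypotheses = ["system output sentence one", "system output sentence two"]
# One reference stream: the i-th reference pairs with the i-th hypothesis.
references = [["reference sentence one", "reference sentence two"]]

bleu = BLEU()   # corpus-level BLEU, reported on a 0-100 scale
chrf = CHRF()   # character n-gram F-score (chrF)

print(bleu.corpus_score(hypotheses, references))
print(chrf.corpus_score(hypotheses, references))
```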