@inproceedings{narayanan-aepli-2024-tulu,
title = "A {T}ulu Resource for Machine Translation",
author = {Narayanan, Manu and
Aepli, No{\"e}mi},
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.155",
pages = "1756--1767",
abstract = "We present the first parallel dataset for English{--}Tulu translation. Tulu, classified within the South Dravidian linguistic family branch, is predominantly spoken by approximately 2.5 million individuals in southwestern India. Our dataset is constructed by integrating human translations into the multilingual machine translation resource FLORES-200. Furthermore, we use this dataset for evaluation purposes in developing our English{--}Tulu machine translation model. For the model{'}s training, we leverage resources available for related South Dravidian languages. We adopt a transfer learning approach that exploits similarities between high-resource and low-resource languages. This method enables the training of a machine translation system even in the absence of parallel data between the source and target language, thereby overcoming a significant obstacle in machine translation development for low-resource languages. Our English{--}Tulu system, trained without using parallel English{--}Tulu data, outperforms Google Translate by 19 BLEU points (in September 2023). The dataset and code are available here: https://github.com/manunarayanan/Tulu-NMT.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="narayanan-aepli-2024-tulu">
<titleInfo>
<title>A Tulu Resource for Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manu</namePart>
<namePart type="family">Narayanan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noëmi</namePart>
<namePart type="family">Aepli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present the first parallel dataset for English–Tulu translation. Tulu, classified within the South Dravidian linguistic family branch, is predominantly spoken by approximately 2.5 million individuals in southwestern India. Our dataset is constructed by integrating human translations into the multilingual machine translation resource FLORES-200. Furthermore, we use this dataset for evaluation purposes in developing our English–Tulu machine translation model. For the model’s training, we leverage resources available for related South Dravidian languages. We adopt a transfer learning approach that exploits similarities between high-resource and low-resource languages. This method enables the training of a machine translation system even in the absence of parallel data between the source and target language, thereby overcoming a significant obstacle in machine translation development for low-resource languages. Our English–Tulu system, trained without using parallel English–Tulu data, outperforms Google Translate by 19 BLEU points (in September 2023). The dataset and code are available here: https://github.com/manunarayanan/Tulu-NMT.</abstract>
<identifier type="citekey">narayanan-aepli-2024-tulu</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.155</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>1756</start>
<end>1767</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Tulu Resource for Machine Translation
%A Narayanan, Manu
%A Aepli, Noëmi
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F narayanan-aepli-2024-tulu
%X We present the first parallel dataset for English–Tulu translation. Tulu, classified within the South Dravidian linguistic family branch, is predominantly spoken by approximately 2.5 million individuals in southwestern India. Our dataset is constructed by integrating human translations into the multilingual machine translation resource FLORES-200. Furthermore, we use this dataset for evaluation purposes in developing our English–Tulu machine translation model. For the model’s training, we leverage resources available for related South Dravidian languages. We adopt a transfer learning approach that exploits similarities between high-resource and low-resource languages. This method enables the training of a machine translation system even in the absence of parallel data between the source and target language, thereby overcoming a significant obstacle in machine translation development for low-resource languages. Our English–Tulu system, trained without using parallel English–Tulu data, outperforms Google Translate by 19 BLEU points (in September 2023). The dataset and code are available here: https://github.com/manunarayanan/Tulu-NMT.
%U https://aclanthology.org/2024.lrec-main.155
%P 1756-1767
Markdown (Informal)
[A Tulu Resource for Machine Translation](https://aclanthology.org/2024.lrec-main.155) (Narayanan & Aepli, LREC-COLING 2024)
ACL
- Manu Narayanan and Noëmi Aepli. 2024. A Tulu Resource for Machine Translation. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 1756–1767, Torino, Italia. ELRA and ICCL.