@inproceedings{jamaluddin-2026-thesis,
title = "Thesis Proposal: Development of End-to-End Speech Translation Models for {I}ndian Languages",
author = "Jamaluddin",
editor = "Baez Santamaria, Selene and
Somayajula, Sai Ashish and
Yamaguchi, Atsuki",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 4: Student Research Workshop)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-srw.41/",
pages = "535--543",
ISBN = "979-8-89176-383-8",
abstract = "Indian languages represent a highly multilingual and low-resource speech ecosystem, where the scarcity of high-quality parallel speech corpora significantly limits the development of speech-to-speech translation systems. Most existing approaches rely on cascaded pipelines that combine automatic speech recognition (ASR), machine translation (MT), and text-to-speech synthesis (TTS). While effective, these cascaded systems often suffer from cumulative error propagation, increased latency, and higher computational complexity, particularly in low-resource Indian languages. To address these challenges, my doctoral work proposes a novel sequence-to-sequence direct speech translation framework capable of translating speech from one Indian language to another without relying on intermediate text representations. Recent advances in deep learning, however, indicate that direct speech translation architectures can surpass conventional cascaded systems in both efficiency and translation quality, motivating the design of our fully end-to-end solution. We aim to release an initial dataset comprising at least 120,000 real speech samples within a 6{--}12 month timeframe."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jamaluddin-2026-thesis">
<titleInfo>
<title>Thesis Proposal: Development of End-to-End Speech Translation Models for Indian Languages</title>
</titleInfo>
<name>
<namePart>Jamaluddin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Selene</namePart>
<namePart type="family">Baez Santamaria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sai</namePart>
<namePart type="given">Ashish</namePart>
<namePart type="family">Somayajula</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atsuki</namePart>
<namePart type="family">Yamaguchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-383-8</identifier>
</relatedItem>
<abstract>Indian languages represent a highly multilingual and low-resource speech ecosystem, where the scarcity of high-quality parallel speech corpora significantly limits the development of speech-to-speech translation systems. Most existing approaches rely on cascaded pipelines that combine automatic speech recognition (ASR), machine translation (MT), and text-to-speech synthesis (TTS). While effective, these cascaded systems often suffer from cumulative error propagation, increased latency, and higher computational complexity, particularly in low-resource Indian languages. To address these challenges, my doctoral work proposes a novel sequence-to-sequence direct speech translation framework capable of translating speech from one Indian language to another without relying on intermediate text representations. Recent advances in deep learning, however, indicate that direct speech translation architectures can surpass conventional cascaded systems in both efficiency and translation quality, motivating the design of our fully end-to-end solution. We aim to release an initial dataset comprising at least 120,000 real speech samples within a 6–12 month timeframe.</abstract>
<identifier type="citekey">jamaluddin-2026-thesis</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-srw.41/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>535</start>
<end>543</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Thesis Proposal: Development of End-to-End Speech Translation Models for Indian Languages
%Y Baez Santamaria, Selene
%Y Somayajula, Sai Ashish
%Y Yamaguchi, Atsuki
%A Jamaluddin
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-383-8
%F jamaluddin-2026-thesis
%X Indian languages represent a highly multilingual and low-resource speech ecosystem, where the scarcity of high-quality parallel speech corpora significantly limits the development of speech-to-speech translation systems. Most existing approaches rely on cascaded pipelines that combine automatic speech recognition (ASR), machine translation (MT), and text-to-speech synthesis (TTS). While effective, these cascaded systems often suffer from cumulative error propagation, increased latency, and higher computational complexity, particularly in low-resource Indian languages. To address these challenges, my doctoral work proposes a novel sequence-to-sequence direct speech translation framework capable of translating speech from one Indian language to another without relying on intermediate text representations. Recent advances in deep learning, however, indicate that direct speech translation architectures can surpass conventional cascaded systems in both efficiency and translation quality, motivating the design of our fully end-to-end solution. We aim to release an initial dataset comprising at least 120,000 real speech samples within a 6–12 month timeframe.
%U https://aclanthology.org/2026.eacl-srw.41/
%P 535-543
Markdown (Informal)
[Thesis Proposal: Development of End-to-End Speech Translation Models for Indian Languages](https://aclanthology.org/2026.eacl-srw.41/) (Jamaluddin, EACL 2026)
ACL