@inproceedings{wei-etal-2025-hitszs,
title = "{HITSZ}{'}s End-To-End Speech Translation Systems Combining Sequence-to-Sequence Auto Speech Recognition Model and {I}ndic Large Language Model for {IWSLT} 2025 in {I}ndic Track",
author = "Wei, Xuchen and
Wu, Yangxin and
Zhang, Yaoyin and
Liu, Henglyu and
Chen, Kehai and
Bai, Xuefeng and
Zhang, Min",
editor = "Salesky, Elizabeth and
Federico, Marcello and
Anastasopoulos, Antonis",
booktitle = "Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria (in-person and online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.iwslt-1.43/",
doi = "10.18653/v1/2025.iwslt-1.43",
pages = "405--411",
ISBN = "979-8-89176-272-5",
abstract = "This paper presents HITSZ{'}s submission for the IWSLT 2025 Indic track, focusing on speech-to-text translation (ST) for English-to-Indic and Indic-to-English language pairs. To enhance translation quality in this low-resource scenario, we propose an end-to-end system integrating the pre-trained Whisper automated speech recognition (ASR) model with Krutrim, an Indic-specialized large language model (LLM). Experimental results demonstrate that our end-to-end system achieved average BLEU scores of 28.88 for English-to-Indic directions and 27.86 for Indic-to-English directions. Furthermore, we investigated the Chain-of-Thought (CoT) method. While this method showed potential for significant translation quality improvements on successfully parsed outputs (e.g. a 13.84 BLEU increase for Tamil-to-English), we observed challenges in ensuring the model consistently adheres to the required CoT output format."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wei-etal-2025-hitszs">
<titleInfo>
<title>HITSZ’s End-To-End Speech Translation Systems Combining Sequence-to-Sequence Auto Speech Recognition Model and Indic Large Language Model for IWSLT 2025 in Indic Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xuchen</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yangxin</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaoyin</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Henglyu</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kehai</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuefeng</namePart>
<namePart type="family">Bai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Salesky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcello</namePart>
<namePart type="family">Federico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonis</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria (in-person and online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-272-5</identifier>
</relatedItem>
<abstract>This paper presents HITSZ’s submission for the IWSLT 2025 Indic track, focusing on speech-to-text translation (ST) for English-to-Indic and Indic-to-English language pairs. To enhance translation quality in this low-resource scenario, we propose an end-to-end system integrating the pre-trained Whisper automated speech recognition (ASR) model with Krutrim, an Indic-specialized large language model (LLM). Experimental results demonstrate that our end-to-end system achieved average BLEU scores of 28.88 for English-to-Indic directions and 27.86 for Indic-to-English directions. Furthermore, we investigated the Chain-of-Thought (CoT) method. While this method showed potential for significant translation quality improvements on successfully parsed outputs (e.g. a 13.84 BLEU increase for Tamil-to-English), we observed challenges in ensuring the model consistently adheres to the required CoT output format.</abstract>
<identifier type="citekey">wei-etal-2025-hitszs</identifier>
<identifier type="doi">10.18653/v1/2025.iwslt-1.43</identifier>
<location>
<url>https://aclanthology.org/2025.iwslt-1.43/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>405</start>
<end>411</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HITSZ’s End-To-End Speech Translation Systems Combining Sequence-to-Sequence Auto Speech Recognition Model and Indic Large Language Model for IWSLT 2025 in Indic Track
%A Wei, Xuchen
%A Wu, Yangxin
%A Zhang, Yaoyin
%A Liu, Henglyu
%A Chen, Kehai
%A Bai, Xuefeng
%A Zhang, Min
%Y Salesky, Elizabeth
%Y Federico, Marcello
%Y Anastasopoulos, Antonis
%S Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria (in-person and online)
%@ 979-8-89176-272-5
%F wei-etal-2025-hitszs
%X This paper presents HITSZ’s submission for the IWSLT 2025 Indic track, focusing on speech-to-text translation (ST) for English-to-Indic and Indic-to-English language pairs. To enhance translation quality in this low-resource scenario, we propose an end-to-end system integrating the pre-trained Whisper automated speech recognition (ASR) model with Krutrim, an Indic-specialized large language model (LLM). Experimental results demonstrate that our end-to-end system achieved average BLEU scores of 28.88 for English-to-Indic directions and 27.86 for Indic-to-English directions. Furthermore, we investigated the Chain-of-Thought (CoT) method. While this method showed potential for significant translation quality improvements on successfully parsed outputs (e.g. a 13.84 BLEU increase for Tamil-to-English), we observed challenges in ensuring the model consistently adheres to the required CoT output format.
%R 10.18653/v1/2025.iwslt-1.43
%U https://aclanthology.org/2025.iwslt-1.43/
%U https://doi.org/10.18653/v1/2025.iwslt-1.43
%P 405-411
Markdown (Informal)
[HITSZ’s End-To-End Speech Translation Systems Combining Sequence-to-Sequence Auto Speech Recognition Model and Indic Large Language Model for IWSLT 2025 in Indic Track](https://aclanthology.org/2025.iwslt-1.43/) (Wei et al., IWSLT 2025)
ACL