@inproceedings{faradishi-widiaputri-etal-2025-naist,
title = "{NAIST} Offline Speech Translation System for {IWSLT} 2025",
author = "Faradishi Widiaputri, Ruhiyah and
Tan, Haotian and
Meyer Saragih, Jan and
Ko, Yuka and
Sudoh, Katsuhito and
Nakamura, Satoshi and
Sakti, Sakriani",
editor = "Salesky, Elizabeth and
Federico, Marcello and
Anastasopoulos, Antonis",
booktitle = "Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria (in-person and online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.iwslt-1.38/",
doi = "10.18653/v1/2025.iwslt-1.38",
pages = "360--368",
isbn = "979-8-89176-272-5",
abstract = "This paper presents NAIST{'}s submission to the offline speech translation task of the IWSLT 2025 evaluation campaign, focusing on English-to-German and English-to-Chinese translation. We implemented both cascade and end-to-end frameworks using various components. For the cascade approach, we used Whisper and SALMONN as automatic speech recognition systems, each paired with Qwen2.5 large language model (LLM) for translation. In the end-to-end setting, we used SALMONN as speech translation and also built a custom model combining the Whisper encoder, DeCo projector, and Qwen2.5 LLM. To further leverage the large language model capabilities, we experimented with different prompting strategies. Additionally, since long speech inputs are segmented for processing, we applied hypothesis combination techniques to generate the final translation output. Our results show that combining Whisper and LLMs can yield strong translation performance, even without further fine-tuning in the cascade setup. Moreover, our proposed end-to-end architecture achieved competitive results, despite being trained on significantly less data compared to SALMONN. Finally, we decided to use both SALMONN as an end-to-end speech translation model and our proposed end-to-end model for our IWSLT 2025 submission for both language pairs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="faradishi-widiaputri-etal-2025-naist">
<titleInfo>
<title>NAIST Offline Speech Translation System for IWSLT 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruhiyah</namePart>
<namePart type="family">Faradishi Widiaputri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haotian</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Meyer Saragih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuka</namePart>
<namePart type="family">Ko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katsuhito</namePart>
<namePart type="family">Sudoh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Satoshi</namePart>
<namePart type="family">Nakamura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Salesky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcello</namePart>
<namePart type="family">Federico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonis</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria (in-person and online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-272-5</identifier>
</relatedItem>
<abstract>This paper presents NAIST’s submission to the offline speech translation task of the IWSLT 2025 evaluation campaign, focusing on English-to-German and English-to-Chinese translation. We implemented both cascade and end-to-end frameworks using various components. For the cascade approach, we used Whisper and SALMONN as automatic speech recognition systems, each paired with Qwen2.5 large language model (LLM) for translation. In the end-to-end setting, we used SALMONN as speech translation and also built a custom model combining the Whisper encoder, DeCo projector, and Qwen2.5 LLM. To further leverage the large language model capabilities, we experimented with different prompting strategies. Additionally, since long speech inputs are segmented for processing, we applied hypothesis combination techniques to generate the final translation output. Our results show that combining Whisper and LLMs can yield strong translation performance, even without further fine-tuning in the cascade setup. Moreover, our proposed end-to-end architecture achieved competitive results, despite being trained on significantly less data compared to SALMONN. Finally, we decided to use both SALMONN as an end-to-end speech translation model and our proposed end-to-end model for our IWSLT 2025 submission for both language pairs.</abstract>
<identifier type="citekey">faradishi-widiaputri-etal-2025-naist</identifier>
<identifier type="doi">10.18653/v1/2025.iwslt-1.38</identifier>
<location>
<url>https://aclanthology.org/2025.iwslt-1.38/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>360</start>
<end>368</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T NAIST Offline Speech Translation System for IWSLT 2025
%A Faradishi Widiaputri, Ruhiyah
%A Tan, Haotian
%A Meyer Saragih, Jan
%A Ko, Yuka
%A Sudoh, Katsuhito
%A Nakamura, Satoshi
%A Sakti, Sakriani
%Y Salesky, Elizabeth
%Y Federico, Marcello
%Y Anastasopoulos, Antonis
%S Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria (in-person and online)
%@ 979-8-89176-272-5
%F faradishi-widiaputri-etal-2025-naist
%X This paper presents NAIST’s submission to the offline speech translation task of the IWSLT 2025 evaluation campaign, focusing on English-to-German and English-to-Chinese translation. We implemented both cascade and end-to-end frameworks using various components. For the cascade approach, we used Whisper and SALMONN as automatic speech recognition systems, each paired with Qwen2.5 large language model (LLM) for translation. In the end-to-end setting, we used SALMONN as speech translation and also built a custom model combining the Whisper encoder, DeCo projector, and Qwen2.5 LLM. To further leverage the large language model capabilities, we experimented with different prompting strategies. Additionally, since long speech inputs are segmented for processing, we applied hypothesis combination techniques to generate the final translation output. Our results show that combining Whisper and LLMs can yield strong translation performance, even without further fine-tuning in the cascade setup. Moreover, our proposed end-to-end architecture achieved competitive results, despite being trained on significantly less data compared to SALMONN. Finally, we decided to use both SALMONN as an end-to-end speech translation model and our proposed end-to-end model for our IWSLT 2025 submission for both language pairs.
%R 10.18653/v1/2025.iwslt-1.38
%U https://aclanthology.org/2025.iwslt-1.38/
%U https://doi.org/10.18653/v1/2025.iwslt-1.38
%P 360-368
Markdown (Informal)
[NAIST Offline Speech Translation System for IWSLT 2025](https://aclanthology.org/2025.iwslt-1.38/) (Faradishi Widiaputri et al., IWSLT 2025)
ACL
- Ruhiyah Faradishi Widiaputri, Haotian Tan, Jan Meyer Saragih, Yuka Ko, Katsuhito Sudoh, Satoshi Nakamura, and Sakriani Sakti. 2025. NAIST Offline Speech Translation System for IWSLT 2025. In Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025), pages 360–368, Vienna, Austria (in-person and online). Association for Computational Linguistics.