@inproceedings{tan-etal-2025-naist,
title = "{NAIST} Simultaneous Speech Translation System for {IWSLT} 2025",
author = "Tan, Haotian and
Faradishi Widiaputri, Ruhiyah and
Meyer Saragih, Jan and
Ko, Yuka and
Sudoh, Katsuhito and
Nakamura, Satoshi and
Sakti, Sakriani",
editor = "Salesky, Elizabeth and
Federico, Marcello and
Anastasopoulos, Antonis",
booktitle = "Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria (in-person and online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.iwslt-1.39/",
doi = "10.18653/v1/2025.iwslt-1.39",
pages = "369--378",
ISBN = "979-8-89176-272-5",
abstract = "This paper describes the NAIST submission to the English-to-German, Japanese, Chinese Simultaneous Speech-to-Text track at IWSLT 2025. Last year, our system was based on an end-to-end speech-to-text translation model that combined HuBERT and mBART. This year, the system consists of a Whisper encoder, the DeCo compressive projector, and the Qwen large language model. The simultaneous translation (SimulST) system is implemented by applying a local agreement policy to an offline-trained translation model. For the streaming translation (StreamST) system, we integrate an online version of the SHAS segmenter into our SimulST architecture. Our results demonstrate that adopting LLMs as the backbone architecture for speech translation tasks yields strong translation performance. Additionally, leveraging robust segmentation capability of SHAS for StreamST achieves good quality-latency trade-off when processing unbounded audio streams."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tan-etal-2025-naist">
<titleInfo>
<title>NAIST Simultaneous Speech Translation System for IWSLT 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haotian</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruhiyah</namePart>
<namePart type="family">Faradishi Widiaputri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Meyer Saragih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuka</namePart>
<namePart type="family">Ko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katsuhito</namePart>
<namePart type="family">Sudoh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Satoshi</namePart>
<namePart type="family">Nakamura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Salesky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcello</namePart>
<namePart type="family">Federico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonis</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria (in-person and online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-272-5</identifier>
</relatedItem>
<abstract>This paper describes the NAIST submission to the English-to-German, Japanese, Chinese Simultaneous Speech-to-Text track at IWSLT 2025. Last year, our system was based on an end-to-end speech-to-text translation model that combined HuBERT and mBART. This year, the system consists of a Whisper encoder, the DeCo compressive projector, and the Qwen large language model. The simultaneous translation (SimulST) system is implemented by applying a local agreement policy to an offline-trained translation model. For the streaming translation (StreamST) system, we integrate an online version of the SHAS segmenter into our SimulST architecture. Our results demonstrate that adopting LLMs as the backbone architecture for speech translation tasks yields strong translation performance. Additionally, leveraging robust segmentation capability of SHAS for StreamST achieves good quality-latency trade-off when processing unbounded audio streams.</abstract>
<identifier type="citekey">tan-etal-2025-naist</identifier>
<identifier type="doi">10.18653/v1/2025.iwslt-1.39</identifier>
<location>
<url>https://aclanthology.org/2025.iwslt-1.39/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>369</start>
<end>378</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T NAIST Simultaneous Speech Translation System for IWSLT 2025
%A Tan, Haotian
%A Faradishi Widiaputri, Ruhiyah
%A Meyer Saragih, Jan
%A Ko, Yuka
%A Sudoh, Katsuhito
%A Nakamura, Satoshi
%A Sakti, Sakriani
%Y Salesky, Elizabeth
%Y Federico, Marcello
%Y Anastasopoulos, Antonis
%S Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria (in-person and online)
%@ 979-8-89176-272-5
%F tan-etal-2025-naist
%X This paper describes the NAIST submission to the English-to-German, Japanese, Chinese Simultaneous Speech-to-Text track at IWSLT 2025. Last year, our system was based on an end-to-end speech-to-text translation model that combined HuBERT and mBART. This year, the system consists of a Whisper encoder, the DeCo compressive projector, and the Qwen large language model. The simultaneous translation (SimulST) system is implemented by applying a local agreement policy to an offline-trained translation model. For the streaming translation (StreamST) system, we integrate an online version of the SHAS segmenter into our SimulST architecture. Our results demonstrate that adopting LLMs as the backbone architecture for speech translation tasks yields strong translation performance. Additionally, leveraging robust segmentation capability of SHAS for StreamST achieves good quality-latency trade-off when processing unbounded audio streams.
%R 10.18653/v1/2025.iwslt-1.39
%U https://aclanthology.org/2025.iwslt-1.39/
%U https://doi.org/10.18653/v1/2025.iwslt-1.39
%P 369-378
Markdown (Informal)
[NAIST Simultaneous Speech Translation System for IWSLT 2025](https://aclanthology.org/2025.iwslt-1.39/) (Tan et al., IWSLT 2025)
ACL
- Haotian Tan, Ruhiyah Faradishi Widiaputri, Jan Meyer Saragih, Yuka Ko, Katsuhito Sudoh, Satoshi Nakamura, and Sakriani Sakti. 2025. NAIST Simultaneous Speech Translation System for IWSLT 2025. In Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025), pages 369–378, Vienna, Austria (in-person and online). Association for Computational Linguistics.