@inproceedings{grigoryan-etal-2026-nemo,
title = "{N}e{M}o@{IWSLT} 2026: Cascaded System for Simultaneous Speech Translation",
author = "Grigoryan, Lilit and
Bataev, Vladimir and
Andrusenko, Andrei and
Hrinchuk, Oleksii and
Karamyan, Davit and
Albasiri, Enas and
Lavrukhin, Vitaly and
Karpov, Nikolay and
Ginsburg, Boris",
editor = "Salesky, Elizabeth and
Anastasopoulos, Antonios and
Negri, Matteo and
Federico, Marcello",
booktitle = "Proceedings of the 23rd International Conference on Spoken Language Translation ({IWSLT} 2026)",
month = jul,
year = "2026",
address = "San Diego, USA (in-person and online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.iwslt-1.23/",
pages = "204--211",
ISBN = "979-8-89176-411-8",
abstract = "This paper describes the NVIDIA NeMo team{'}s submission to the IWSLT 2026 Simultaneous Speech Translation (SimulST) tracks. We use a cascaded architecture combining a dual-mode Unified ASR Transducer model with a multilingual Large Language Model (LLM). The ASR is trained to deliver stable transcriptions across a wide range of latencies, providing a reliable foundation for high-quality LLM translation. Our submission participates in the English{--}German, English{--}Italian, and English{--}Chinese tasks, in both standard and contextualized settings, as well as the Czech{--}English standard track, covering both low- and high-latency scenarios. We further analyze how ASR and LLM design choices affect the system{'}s overall latency and translation quality."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="grigoryan-etal-2026-nemo">
<titleInfo>
<title>NeMo@IWSLT 2026: Cascaded System for Simultaneous Speech Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lilit</namePart>
<namePart type="family">Grigoryan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vladimir</namePart>
<namePart type="family">Bataev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrei</namePart>
<namePart type="family">Andrusenko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oleksii</namePart>
<namePart type="family">Hrinchuk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Davit</namePart>
<namePart type="family">Karamyan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enas</namePart>
<namePart type="family">Albasiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vitaly</namePart>
<namePart type="family">Lavrukhin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolay</namePart>
<namePart type="family">Karpov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Boris</namePart>
<namePart type="family">Ginsburg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd International Conference on Spoken Language Translation (IWSLT 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Salesky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonios</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matteo</namePart>
<namePart type="family">Negri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcello</namePart>
<namePart type="family">Federico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, USA (in-person and online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-411-8</identifier>
</relatedItem>
<abstract>This paper describes the NVIDIA NeMo team’s submission to the IWSLT 2026 Simultaneous Speech Translation (SimulST) tracks. We use a cascaded architecture combining a dual-mode Unified ASR Transducer model with a multilingual Large Language Model (LLM). The ASR is trained to deliver stable transcriptions across a wide range of latencies, providing a reliable foundation for high-quality LLM translation. Our submission participates in the English–German, English–Italian, and English–Chinese tasks, in both standard and contextualized settings, as well as the Czech–English standard track, covering both low- and high-latency scenarios. We further analyze how ASR and LLM design choices affect the system’s overall latency and translation quality.</abstract>
<identifier type="citekey">grigoryan-etal-2026-nemo</identifier>
<location>
<url>https://aclanthology.org/2026.iwslt-1.23/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>204</start>
<end>211</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T NeMo@IWSLT 2026: Cascaded System for Simultaneous Speech Translation
%A Grigoryan, Lilit
%A Bataev, Vladimir
%A Andrusenko, Andrei
%A Hrinchuk, Oleksii
%A Karamyan, Davit
%A Albasiri, Enas
%A Lavrukhin, Vitaly
%A Karpov, Nikolay
%A Ginsburg, Boris
%Y Salesky, Elizabeth
%Y Anastasopoulos, Antonios
%Y Negri, Matteo
%Y Federico, Marcello
%S Proceedings of the 23rd International Conference on Spoken Language Translation (IWSLT 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, USA (in-person and online)
%@ 979-8-89176-411-8
%F grigoryan-etal-2026-nemo
%X This paper describes the NVIDIA NeMo team’s submission to the IWSLT 2026 Simultaneous Speech Translation (SimulST) tracks. We use a cascaded architecture combining a dual-mode Unified ASR Transducer model with a multilingual Large Language Model (LLM). The ASR is trained to deliver stable transcriptions across a wide range of latencies, providing a reliable foundation for high-quality LLM translation. Our submission participates in the English–German, English–Italian, and English–Chinese tasks, in both standard and contextualized settings, as well as the Czech–English standard track, covering both low- and high-latency scenarios. We further analyze how ASR and LLM design choices affect the system’s overall latency and translation quality.
%U https://aclanthology.org/2026.iwslt-1.23/
%P 204-211
Markdown (Informal)
[NeMo@IWSLT 2026: Cascaded System for Simultaneous Speech Translation](https://aclanthology.org/2026.iwslt-1.23/) (Grigoryan et al., IWSLT 2026)
ACL
- Lilit Grigoryan, Vladimir Bataev, Andrei Andrusenko, Oleksii Hrinchuk, Davit Karamyan, Enas Albasiri, Vitaly Lavrukhin, Nikolay Karpov, and Boris Ginsburg. 2026. NeMo@IWSLT 2026: Cascaded System for Simultaneous Speech Translation. In Proceedings of the 23rd International Conference on Spoken Language Translation (IWSLT 2026), pages 204–211, San Diego, USA (in-person and online). Association for Computational Linguistics.