% Conference-paper entry (inproceedings) for the IWSLT 2026 proceedings; cite with key pong-2026-towards.
@inproceedings{pong-2026-towards,
  title     = {Towards Dynamic Attention Masking for Simultaneous Speech Translation},
  author    = {Pong, Benjamin},
  editor    = {Salesky, Elizabeth and
               Anastasopoulos, Antonios and
               Negri, Matteo and
               Federico, Marcello},
  booktitle = {Proceedings of the 23rd International Conference on Spoken Language Translation ({IWSLT} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, USA (in-person and online)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.iwslt-1.20/},
  pages     = {183--188},
  isbn      = {979-8-89176-411-8},
  abstract  = {We present a proof-of-concept system for simultaneous speech translation based on dynamic attention masking. Our approach builds on SeamlessM4T by injecting lightweight per-layer schedulers into the conformer-encoder, training each scheduler to predict the number of future frames needed for translation. The schedulers are trained jointly with LoRA adapters across three language directions: English to German, Italian, and Chinese. At inference time, we evaluate our system using sliding window retranslation inference regime (Sen et al., 2022), and an adapted version of StreamAtt (Papi et al., 2024) that replaces the fixed cutoff with a content-aware threshold derived from the learnt representations from the scheduler outputs.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper; the host relatedItem describes the proceedings volume that contains it. -->
  <mods ID="pong-2026-towards">
    <titleInfo>
      <title>Towards Dynamic Attention Masking for Simultaneous Speech Translation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Benjamin</namePart>
      <namePart type="family">Pong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 23rd International Conference on Spoken Language Translation (IWSLT 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Elizabeth</namePart>
        <namePart type="family">Salesky</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Antonios</namePart>
        <namePart type="family">Anastasopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Matteo</namePart>
        <namePart type="family">Negri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marcello</namePart>
        <namePart type="family">Federico</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, USA (in-person and online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-411-8</identifier>
    </relatedItem>
    <abstract>We present a proof-of-concept system for simultaneous speech translation based on dynamic attention masking. Our approach builds on SeamlessM4T by injecting lightweight per-layer schedulers into the conformer-encoder, training each scheduler to predict the number of future frames needed for translation. The schedulers are trained jointly with LoRA adapters across three language directions: English to German, Italian, and Chinese. At inference time, we evaluate our system using sliding window retranslation inference regime (Sen et al., 2022), and an adapted version of StreamAtt (Papi et al., 2024) that replaces the fixed cutoff with a content-aware threshold derived from the learnt representations from the scheduler outputs.</abstract>
    <identifier type="citekey">pong-2026-towards</identifier>
    <location>
      <url>https://aclanthology.org/2026.iwslt-1.20/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>183</start>
        <end>188</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Dynamic Attention Masking for Simultaneous Speech Translation
%A Pong, Benjamin
%Y Salesky, Elizabeth
%Y Anastasopoulos, Antonios
%Y Negri, Matteo
%Y Federico, Marcello
%S Proceedings of the 23rd International Conference on Spoken Language Translation (IWSLT 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, USA (in-person and online)
%@ 979-8-89176-411-8
%F pong-2026-towards
%X We present a proof-of-concept system for simultaneous speech translation based on dynamic attention masking. Our approach builds on SeamlessM4T by injecting lightweight per-layer schedulers into the conformer-encoder, training each scheduler to predict the number of future frames needed for translation. The schedulers are trained jointly with LoRA adapters across three language directions: English to German, Italian, and Chinese. At inference time, we evaluate our system using sliding window retranslation inference regime (Sen et al., 2022), and an adapted version of StreamAtt (Papi et al., 2024) that replaces the fixed cutoff with a content-aware threshold derived from the learnt representations from the scheduler outputs.
%U https://aclanthology.org/2026.iwslt-1.20/
%P 183-188
Markdown (Informal)
[Towards Dynamic Attention Masking for Simultaneous Speech Translation](https://aclanthology.org/2026.iwslt-1.20/) (Pong, IWSLT 2026)
ACL