@inproceedings{zanon-boito-etal-2026-naver,
title = "{NAVER} {LABS} {E}urope Submission to the Instruction-following 2026 Short Track",
author = "Zanon Boito, Marcely and
Yadav, Hemant and
Meunier, Jean-Luc and
Calapodescu, Ioan",
editor = "Salesky, Elizabeth and
Anastasopoulos, Antonios and
Negri, Matteo and
Federico, Marcello",
booktitle = "Proceedings of the 23rd International Conference on Spoken Language Translation ({IWSLT} 2026)",
month = jul,
year = "2026",
address = "San Diego, USA (in-person and online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.iwslt-1.17/",
pages = "150--163",
ISBN = "979-8-89176-411-8",
abstract = "In this paper, we describe NAVER LABS Europe{'}s submission to the instruction-following speech processing short track at IWSLT 2026. We participate again in the constrained setting, developing systems capable of jointly performing ASR, ST, and SQA from English speech into Chinese, Italian, and German. Building on our previous submission, ranked first in last year{'}s short track, we update our multi-stage training pipeline by replacing the speech projector with SpeechMapper, a method for learning a speech-to-LLM embedding projector using ASR-only data. In addition, we introduce a synthetic SQA dataset, fakACL, composed of artificially generated scientific presentations. This dataset is built by prompting the LLM backbone, segmenting the generated talks, and synthesizing speech with Seamless. The combination of an improved speech projection mechanism and domain-specific synthetic data allows our model to outperform last year{'}s best short-track system, while being considerably more compact and relying on a weaker LLM backbone."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zanon-boito-etal-2026-naver">
<titleInfo>
<title>NAVER LABS Europe Submission to the Instruction-following 2026 Short Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marcely</namePart>
<namePart type="family">Zanon Boito</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hemant</namePart>
<namePart type="family">Yadav</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jean-Luc</namePart>
<namePart type="family">Meunier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ioan</namePart>
<namePart type="family">Calapodescu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd International Conference on Spoken Language Translation (IWSLT 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Salesky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonios</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matteo</namePart>
<namePart type="family">Negri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcello</namePart>
<namePart type="family">Federico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, USA (in-person and online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-411-8</identifier>
</relatedItem>
<abstract>In this paper, we describe NAVER LABS Europe’s submission to the instruction-following speech processing short track at IWSLT 2026. We participate again in the constrained setting, developing systems capable of jointly performing ASR, ST, and SQA from English speech into Chinese, Italian, and German. Building on our previous submission, ranked first in last year’s short track, we update our multi-stage training pipeline by replacing the speech projector with SpeechMapper, a method for learning a speech-to-LLM embedding projector using ASR-only data. In addition, we introduce a synthetic SQA dataset, fakACL, composed of artificially generated scientific presentations. This dataset is built by prompting the LLM backbone, segmenting the generated talks, and synthesizing speech with Seamless. The combination of an improved speech projection mechanism and domain-specific synthetic data allows our model to outperform last year’s best short-track system, while being considerably more compact and relying on a weaker LLM backbone.</abstract>
<identifier type="citekey">zanon-boito-etal-2026-naver</identifier>
<location>
<url>https://aclanthology.org/2026.iwslt-1.17/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>150</start>
<end>163</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T NAVER LABS Europe Submission to the Instruction-following 2026 Short Track
%A Zanon Boito, Marcely
%A Yadav, Hemant
%A Meunier, Jean-Luc
%A Calapodescu, Ioan
%Y Salesky, Elizabeth
%Y Anastasopoulos, Antonios
%Y Negri, Matteo
%Y Federico, Marcello
%S Proceedings of the 23rd International Conference on Spoken Language Translation (IWSLT 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, USA (in-person and online)
%@ 979-8-89176-411-8
%F zanon-boito-etal-2026-naver
%X In this paper, we describe NAVER LABS Europe’s submission to the instruction-following speech processing short track at IWSLT 2026. We participate again in the constrained setting, developing systems capable of jointly performing ASR, ST, and SQA from English speech into Chinese, Italian, and German. Building on our previous submission, ranked first in last year’s short track, we update our multi-stage training pipeline by replacing the speech projector with SpeechMapper, a method for learning a speech-to-LLM embedding projector using ASR-only data. In addition, we introduce a synthetic SQA dataset, fakACL, composed of artificially generated scientific presentations. This dataset is built by prompting the LLM backbone, segmenting the generated talks, and synthesizing speech with Seamless. The combination of an improved speech projection mechanism and domain-specific synthetic data allows our model to outperform last year’s best short-track system, while being considerably more compact and relying on a weaker LLM backbone.
%U https://aclanthology.org/2026.iwslt-1.17/
%P 150-163
Markdown (Informal)
[NAVER LABS Europe Submission to the Instruction-following 2026 Short Track](https://aclanthology.org/2026.iwslt-1.17/) (Zanon Boito et al., IWSLT 2026)
ACL