@inproceedings{attanasio-etal-2025-instituto,
title = "Instituto de Telecomunica{\c{c}}{\~o}es at {IWSLT} 2025: Aligning Small-Scale Speech and Language Models for Speech-to-Text Learning",
author = "Attanasio, Giuseppe and
Sannigrahi, Sonal and
Peters, Ben and
Filipe Torres Martins, Andr{\'e}",
editor = "Salesky, Elizabeth and
Federico, Marcello and
Anastasopoulos, Antonis",
booktitle = "Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria (in-person and online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.iwslt-1.36/",
doi = "10.18653/v1/2025.iwslt-1.36",
pages = "347--353",
ISBN = "979-8-89176-272-5",
abstract = "This paper presents Instituto de Telecomunica{\c{c}}{\~o}es{'}s submission to the IWSLT 2025 Shared Task on Instruction Following Speech Processing. We submit results for the Short Track, i.e., speech recognition, translation, and spoken question answering. Our model is a unified speech-to-text model that integrates a pretrained continuous speech encoder and text decoder through a first phase of modality alignment and a second phase of instruction fine-tuning. Crucially, we focus on using small-scale language model backbones ({\ensuremath{<}} 2B) and restrict to high-quality, CC-BY data along with synthetic data generation to supplement existing resources."
}