@inproceedings{ma-etal-2025-cross,
title = "Cross-Lingual Transfer Learning for Speech Translation",
author = "Ma, Rao and
Qian, Mengjie and
Fathullah, Yassir and
Tang, Siyuan and
Gales, Mark and
Knill, Kate",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-short.4/",
doi = "10.18653/v1/2025.naacl-short.4",
pages = "33--43",
ISBN = "979-8-89176-190-2",
abstract = "There has been increasing interest in building multilingual foundation models for NLP and speech research. This paper examines how to expand the speech translation capability of these models with restricted data. Whisper, a speech foundation model with strong performance on speech recognition and English translation, is used as the example model. Using speech-to-speech retrieval to analyse the audio representations generated by the encoder, we show that utterances from different languages are mapped to a shared semantic space. This shared embedding space can then be leveraged for zero-shot cross-lingual transfer in speech translation. By fine-tuning the Whisper decoder with only English-to-Chinese speech translation data, improved performance for translation to Chinese can be obtained for multiple languages, in addition to English. Furthermore, for languages related to those seen in training it is possible to perform speech translation, despite the model never seeing the language in training, or being able to perform transcription."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ma-etal-2025-cross">
<titleInfo>
<title>Cross-Lingual Transfer Learning for Speech Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rao</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mengjie</namePart>
<namePart type="family">Qian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yassir</namePart>
<namePart type="family">Fathullah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siyuan</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Gales</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kate</namePart>
<namePart type="family">Knill</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-190-2</identifier>
</relatedItem>
<abstract>There has been increasing interest in building multilingual foundation models for NLP and speech research. This paper examines how to expand the speech translation capability of these models with restricted data. Whisper, a speech foundation model with strong performance on speech recognition and English translation, is used as the example model. Using speech-to-speech retrieval to analyse the audio representations generated by the encoder, we show that utterances from different languages are mapped to a shared semantic space. This shared embedding space can then be leveraged for zero-shot cross-lingual transfer in speech translation. By fine-tuning the Whisper decoder with only English-to-Chinese speech translation data, improved performance for translation to Chinese can be obtained for multiple languages, in addition to English. Furthermore, for languages related to those seen in training it is possible to perform speech translation, despite the model never seeing the language in training, or being able to perform transcription.</abstract>
<identifier type="citekey">ma-etal-2025-cross</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-short.4</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-short.4/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>33</start>
<end>43</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Cross-Lingual Transfer Learning for Speech Translation
%A Ma, Rao
%A Qian, Mengjie
%A Fathullah, Yassir
%A Tang, Siyuan
%A Gales, Mark
%A Knill, Kate
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-190-2
%F ma-etal-2025-cross
%X There has been increasing interest in building multilingual foundation models for NLP and speech research. This paper examines how to expand the speech translation capability of these models with restricted data. Whisper, a speech foundation model with strong performance on speech recognition and English translation, is used as the example model. Using speech-to-speech retrieval to analyse the audio representations generated by the encoder, we show that utterances from different languages are mapped to a shared semantic space. This shared embedding space can then be leveraged for zero-shot cross-lingual transfer in speech translation. By fine-tuning the Whisper decoder with only English-to-Chinese speech translation data, improved performance for translation to Chinese can be obtained for multiple languages, in addition to English. Furthermore, for languages related to those seen in training it is possible to perform speech translation, despite the model never seeing the language in training, or being able to perform transcription.
%R 10.18653/v1/2025.naacl-short.4
%U https://aclanthology.org/2025.naacl-short.4/
%U https://doi.org/10.18653/v1/2025.naacl-short.4
%P 33-43
Markdown (Informal)
[Cross-Lingual Transfer Learning for Speech Translation](https://aclanthology.org/2025.naacl-short.4/) (Ma et al., NAACL 2025)
ACL
- Rao Ma, Mengjie Qian, Yassir Fathullah, Siyuan Tang, Mark Gales, and Kate Knill. 2025. Cross-Lingual Transfer Learning for Speech Translation. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers), pages 33–43, Albuquerque, New Mexico. Association for Computational Linguistics.