BibTeX
@inproceedings{tsiamas-etal-2024-pushing,
title = "Pushing the Limits of Zero-shot End-to-End Speech Translation",
author = "Tsiamas, Ioannis and
G{\'a}llego, Gerard I. and
Fonollosa, Jos{\'e} A. R. and
Costa-juss{\`a}, Marta R.",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.847/",
doi = "10.18653/v1/2024.findings-acl.847",
pages = "14245--14267",
abstract = "Data scarcity and the modality gap between the speech and text modalities are two major obstacles of end-to-end Speech Translation (ST) systems, thus hindering their performance. Prior work has attempted to mitigate these challenges by leveraging external MT data and optimizing distance metrics that bring closer the speech-text representations. However, achieving competitive results typically requires some ST data. For this reason, we introduce ZeroSwot, a method for zero-shot ST that bridges the modality gap without any paired ST data. Leveraging a novel CTC compression and Optimal Transport, we train a speech encoder using only ASR data, to align with the representation space of a massively multilingual MT model. The speech encoder seamlessly integrates with the MT model at inference, enabling direct translation from speech to text, across all languages supported by the MT model. Our experiments show that we can effectively close the modality gap without ST data, while our results on MuST-C and CoVoST demonstrate our method{'}s superiority over not only previous zero-shot models, but also supervised ones, achieving state-of-the-art results."
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tsiamas-etal-2024-pushing">
<titleInfo>
<title>Pushing the Limits of Zero-shot End-to-End Speech Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ioannis</namePart>
<namePart type="family">Tsiamas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gerard</namePart>
<namePart type="given">I</namePart>
<namePart type="family">Gállego</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">José</namePart>
<namePart type="given">A</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Fonollosa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Costa-jussà</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Data scarcity and the modality gap between the speech and text modalities are two major obstacles of end-to-end Speech Translation (ST) systems, thus hindering their performance. Prior work has attempted to mitigate these challenges by leveraging external MT data and optimizing distance metrics that bring closer the speech-text representations. However, achieving competitive results typically requires some ST data. For this reason, we introduce ZeroSwot, a method for zero-shot ST that bridges the modality gap without any paired ST data. Leveraging a novel CTC compression and Optimal Transport, we train a speech encoder using only ASR data, to align with the representation space of a massively multilingual MT model. The speech encoder seamlessly integrates with the MT model at inference, enabling direct translation from speech to text, across all languages supported by the MT model. Our experiments show that we can effectively close the modality gap without ST data, while our results on MuST-C and CoVoST demonstrate our method’s superiority over not only previous zero-shot models, but also supervised ones, achieving state-of-the-art results.</abstract>
<identifier type="citekey">tsiamas-etal-2024-pushing</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.847</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.847/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>14245</start>
<end>14267</end>
</extent>
</part>
</mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T Pushing the Limits of Zero-shot End-to-End Speech Translation
%A Tsiamas, Ioannis
%A Gállego, Gerard I.
%A Fonollosa, José A. R.
%A Costa-jussà, Marta R.
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F tsiamas-etal-2024-pushing
%X Data scarcity and the modality gap between the speech and text modalities are two major obstacles of end-to-end Speech Translation (ST) systems, thus hindering their performance. Prior work has attempted to mitigate these challenges by leveraging external MT data and optimizing distance metrics that bring closer the speech-text representations. However, achieving competitive results typically requires some ST data. For this reason, we introduce ZeroSwot, a method for zero-shot ST that bridges the modality gap without any paired ST data. Leveraging a novel CTC compression and Optimal Transport, we train a speech encoder using only ASR data, to align with the representation space of a massively multilingual MT model. The speech encoder seamlessly integrates with the MT model at inference, enabling direct translation from speech to text, across all languages supported by the MT model. Our experiments show that we can effectively close the modality gap without ST data, while our results on MuST-C and CoVoST demonstrate our method’s superiority over not only previous zero-shot models, but also supervised ones, achieving state-of-the-art results.
%R 10.18653/v1/2024.findings-acl.847
%U https://aclanthology.org/2024.findings-acl.847/
%U https://doi.org/10.18653/v1/2024.findings-acl.847
%P 14245-14267

Markdown (Informal)
[Pushing the Limits of Zero-shot End-to-End Speech Translation](https://aclanthology.org/2024.findings-acl.847/) (Tsiamas et al., Findings 2024)
ACL
Ioannis Tsiamas, Gerard I. Gállego, José A. R. Fonollosa, and Marta R. Costa-jussà. 2024. Pushing the Limits of Zero-shot End-to-End Speech Translation. In Findings of the Association for Computational Linguistics: ACL 2024, pages 14245–14267, Bangkok, Thailand. Association for Computational Linguistics.
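
The abstract describes training a speech encoder on ASR data only, compressing its frame-level outputs with CTC and aligning them to a multilingual MT model's representation space via Optimal Transport. The sketch below is a minimal, hypothetical illustration of that idea, not the authors' implementation: all module sizes and names are assumed, the text side is a toy embedding table standing in for the MT encoder, and a plain MSE loss with crude length matching replaces the paper's Optimal Transport objective.

```python
# Minimal sketch (PyTorch) of CTC-compression + speech-text alignment, assuming
# toy modules; the real ZeroSwot aligns to a massively multilingual MT model
# with an Optimal Transport loss, which is simplified to MSE here.
import torch
import torch.nn as nn
import torch.nn.functional as F

VOCAB, BLANK, DIM = 100, 0, 256

speech_encoder = nn.GRU(input_size=80, hidden_size=DIM, batch_first=True)  # stand-in acoustic encoder
ctc_head = nn.Linear(DIM, VOCAB)                                            # predicts subword units + blank
text_embedding = nn.Embedding(VOCAB, DIM)                                   # frozen stand-in for the MT encoder space

def ctc_compress(frames: torch.Tensor, logits: torch.Tensor) -> torch.Tensor:
    """Average consecutive frames that greedy CTC assigns to the same non-blank token."""
    preds = logits.argmax(-1)            # (T,) greedy CTC path for one utterance
    segments, current, prev = [], [], None
    for t, p in enumerate(preds.tolist()):
        if p == BLANK:
            prev = None
            continue
        if p != prev and current:        # token boundary: close the previous segment
            segments.append(torch.stack(current).mean(0))
            current = []
        current.append(frames[t])
        prev = p
    if current:
        segments.append(torch.stack(current).mean(0))
    return torch.stack(segments) if segments else frames.mean(0, keepdim=True)

# One toy training step on fake ASR data (audio features + transcript token ids).
audio = torch.randn(1, 50, 80)                  # (batch, frames, mel bins)
transcript = torch.randint(1, VOCAB, (1, 7))    # (batch, tokens), no blank symbol

frames, _ = speech_encoder(audio)               # (1, 50, DIM)
logits = ctc_head(frames)                       # (1, 50, VOCAB)
log_probs = logits.log_softmax(-1).transpose(0, 1)  # (T, B, V) as ctc_loss expects

ctc = F.ctc_loss(log_probs, transcript,
                 input_lengths=torch.tensor([50]),
                 target_lengths=torch.tensor([7]),
                 blank=BLANK)

compressed = ctc_compress(frames[0], logits[0])  # (~num tokens, DIM)
target = text_embedding(transcript[0]).detach()  # frozen text-side representation
n = min(len(compressed), len(target))            # crude length matching; the paper uses OT instead
align = F.mse_loss(compressed[:n], target[:n])

loss = ctc + align
loss.backward()
print(f"ctc={ctc.item():.3f}  align={align.item():.3f}")
```

At inference the trained speech encoder would feed the MT model's decoder directly in place of its text encoder, which is what makes translation possible into any language the MT model supports without paired ST data.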