@inproceedings{ye-etal-2022-cross,
    title = "Cross-modal Contrastive Learning for Speech Translation",
    author = "Ye, Rong and
      Wang, Mingxuan and
      Li, Lei",
    editor = "Carpuat, Marine and
      de Marneffe, Marie-Catherine and
      Meza Ruiz, Ivan Vladimir",
    booktitle = "Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
    month = jul,
    year = "2022",
    address = "Seattle, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.naacl-main.376",
    doi = "10.18653/v1/2022.naacl-main.376",
    pages = "5099--5113",
    abstract = "How can we learn unified representations for spoken utterances and their written text? Learning similar representations for semantically similar speech and text is important for speech translation. To this end, we propose ConST, a cross-modal contrastive learning method for end-to-end speech-to-text translation. We evaluate ConST and a variety of previous baselines on a popular benchmark MuST-C. Experiments show that the proposed ConST consistently outperforms the previous methods, and achieves an average BLEU of 29.4. The analysis further verifies that ConST indeed closes the representation gap of different modalities {---} its learned representation improves the accuracy of cross-modal speech-text retrieval from 4{\%} to 88{\%}. Code and models are available at \url{https://github.com/ReneeYe/ConST}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ye-etal-2022-cross">
    <titleInfo>
      <title>Cross-modal Contrastive Learning for Speech Translation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Rong</namePart>
      <namePart type="family">Ye</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mingxuan</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lei</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Marine</namePart>
        <namePart type="family">Carpuat</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marie-Catherine</namePart>
        <namePart type="family">de Marneffe</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ivan</namePart>
        <namePart type="given">Vladimir</namePart>
        <namePart type="family">Meza Ruiz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Seattle, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>How can we learn unified representations for spoken utterances and their written text? Learning similar representations for semantically similar speech and text is important for speech translation. To this end, we propose ConST, a cross-modal contrastive learning method for end-to-end speech-to-text translation. We evaluate ConST and a variety of previous baselines on a popular benchmark MuST-C. Experiments show that the proposed ConST consistently outperforms the previous methods, and achieves an average BLEU of 29.4. The analysis further verifies that ConST indeed closes the representation gap of different modalities — its learned representation improves the accuracy of cross-modal speech-text retrieval from 4% to 88%. Code and models are available at https://github.com/ReneeYe/ConST.</abstract>
    <identifier type="citekey">ye-etal-2022-cross</identifier>
    <identifier type="doi">10.18653/v1/2022.naacl-main.376</identifier>
    <location>
      <url>https://aclanthology.org/2022.naacl-main.376</url>
    </location>
    <part>
      <date>2022-07</date>
      <extent unit="page">
        <start>5099</start>
        <end>5113</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Cross-modal Contrastive Learning for Speech Translation
%A Ye, Rong
%A Wang, Mingxuan
%A Li, Lei
%Y Carpuat, Marine
%Y de Marneffe, Marie-Catherine
%Y Meza Ruiz, Ivan Vladimir
%S Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, United States
%F ye-etal-2022-cross
%X How can we learn unified representations for spoken utterances and their written text? Learning similar representations for semantically similar speech and text is important for speech translation. To this end, we propose ConST, a cross-modal contrastive learning method for end-to-end speech-to-text translation. We evaluate ConST and a variety of previous baselines on a popular benchmark MuST-C. Experiments show that the proposed ConST consistently outperforms the previous methods, and achieves an average BLEU of 29.4. The analysis further verifies that ConST indeed closes the representation gap of different modalities — its learned representation improves the accuracy of cross-modal speech-text retrieval from 4% to 88%. Code and models are available at https://github.com/ReneeYe/ConST.
%R 10.18653/v1/2022.naacl-main.376
%U https://aclanthology.org/2022.naacl-main.376
%U https://doi.org/10.18653/v1/2022.naacl-main.376
%P 5099-5113
Markdown (Informal)
[Cross-modal Contrastive Learning for Speech Translation](https://aclanthology.org/2022.naacl-main.376) (Ye et al., NAACL 2022)
ACL
Rong Ye, Mingxuan Wang, and Lei Li. 2022. Cross-modal Contrastive Learning for Speech Translation. In Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 5099–5113, Seattle, United States. Association for Computational Linguistics.
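
The abstract above describes ConST's core idea: a contrastive objective that pulls paired speech and text representations together so that the two modalities share a representation space. As a rough illustration only, the following is a minimal PyTorch sketch of a symmetric, InfoNCE-style cross-modal contrastive loss of that general kind; the function name, the mean-pooled (batch, dim) inputs, and the temperature value are illustrative assumptions, not taken from the ConST paper or its released code at https://github.com/ReneeYe/ConST.

import torch
import torch.nn.functional as F

def cross_modal_contrastive_loss(speech_repr, text_repr, temperature=0.1):
    # speech_repr, text_repr: (batch, dim) pooled encoder outputs; row i of each
    # tensor is assumed to come from the same utterance/transcript pair.
    speech = F.normalize(speech_repr, dim=-1)
    text = F.normalize(text_repr, dim=-1)
    # Cosine similarity of every speech vector to every text vector in the batch.
    logits = speech @ text.t() / temperature  # (batch, batch)
    targets = torch.arange(speech.size(0), device=speech.device)
    # Symmetric InfoNCE: matched pairs are positives, other batch items are negatives.
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))

if __name__ == "__main__":
    # Toy usage with random tensors standing in for pooled encoder outputs.
    speech_batch = torch.randn(8, 256)
    text_batch = torch.randn(8, 256)
    print(cross_modal_contrastive_loss(speech_batch, text_batch).item())

In practice such a term would be added to the usual translation cross-entropy loss during training; the weighting between the two objectives is a hyperparameter discussed in the paper itself, not fixed by this sketch.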