% Conference paper in the IWSLT 2024 proceedings; `url` points to its ACL Anthology page.
@inproceedings{liu-niehues-2024-recent,
  title     = {Recent Highlights in Multilingual and Multimodal Speech Translation},
  author    = {Liu, Danni and Niehues, Jan},
  editor    = {Salesky, Elizabeth and Federico, Marcello and Carpuat, Marine},
  booktitle = {Proceedings of the 21st International Conference on Spoken Language Translation (IWSLT 2024)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand (in-person and online)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.iwslt-1.29},
  doi       = {10.18653/v1/2024.iwslt-1.29},
  pages     = {235--253},
  abstract  = {Speech translation has witnessed significant progress driven by advancements in modeling techniques and the growing availability of training data. In this paper, we highlight recent advances in two ongoing research directions in ST: scaling the models to 1) many translation directions (multilingual ST) and 2) beyond the text output modality (multimodal ST). We structure this review by examining the sequential stages of a model{'}s development lifecycle: determining training resources, selecting model architecture, training procedures, evaluation metrics, and deployment considerations. We aim to highlight recent developments in each stage, with a particular focus on model architectures (dedicated speech translation models and LLM-based general-purpose model) and training procedures (task-specific vs. task-invariant approaches). Based on the reviewed advancements, we identify and discuss ongoing challenges within the field of speech translation.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS collection holding a single record, keyed by the citekey below. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="liu-niehues-2024-recent">
    <titleInfo>
      <title>Recent Highlights in Multilingual and Multimodal Speech Translation</title>
    </titleInfo>
    <!-- Authors of the paper, in listed order. -->
    <name type="personal">
      <namePart type="given">Danni</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jan</namePart>
      <namePart type="family">Niehues</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 21st International Conference on Spoken Language Translation (IWSLT 2024)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Elizabeth</namePart>
        <namePart type="family">Salesky</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marcello</namePart>
        <namePart type="family">Federico</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marine</namePart>
        <namePart type="family">Carpuat</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bangkok, Thailand (in-person and online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Speech translation has witnessed significant progress driven by advancements in modeling techniques and the growing availability of training data. In this paper, we highlight recent advances in two ongoing research directions in ST: scaling the models to 1) many translation directions (multilingual ST) and 2) beyond the text output modality (multimodal ST). We structure this review by examining the sequential stages of a model’s development lifecycle: determining training resources, selecting model architecture, training procedures, evaluation metrics, and deployment considerations. We aim to highlight recent developments in each stage, with a particular focus on model architectures (dedicated speech translation models and LLM-based general-purpose model) and training procedures (task-specific vs. task-invariant approaches). Based on the reviewed advancements, we identify and discuss ongoing challenges within the field of speech translation.</abstract>
    <!-- Identifiers and access location for the paper. -->
    <identifier type="citekey">liu-niehues-2024-recent</identifier>
    <identifier type="doi">10.18653/v1/2024.iwslt-1.29</identifier>
    <location>
      <url>https://aclanthology.org/2024.iwslt-1.29</url>
    </location>
    <!-- Issue date and page extent of this part. -->
    <part>
      <date>2024-08</date>
      <extent unit="page">
        <start>235</start>
        <end>253</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Recent Highlights in Multilingual and Multimodal Speech Translation
%A Liu, Danni
%A Niehues, Jan
%Y Salesky, Elizabeth
%Y Federico, Marcello
%Y Carpuat, Marine
%S Proceedings of the 21st International Conference on Spoken Language Translation (IWSLT 2024)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand (in-person and online)
%F liu-niehues-2024-recent
%X Speech translation has witnessed significant progress driven by advancements in modeling techniques and the growing availability of training data. In this paper, we highlight recent advances in two ongoing research directions in ST: scaling the models to 1) many translation directions (multilingual ST) and 2) beyond the text output modality (multimodal ST). We structure this review by examining the sequential stages of a model’s development lifecycle: determining training resources, selecting model architecture, training procedures, evaluation metrics, and deployment considerations. We aim to highlight recent developments in each stage, with a particular focus on model architectures (dedicated speech translation models and LLM-based general-purpose model) and training procedures (task-specific vs. task-invariant approaches). Based on the reviewed advancements, we identify and discuss ongoing challenges within the field of speech translation.
%R 10.18653/v1/2024.iwslt-1.29
%U https://aclanthology.org/2024.iwslt-1.29
%U https://doi.org/10.18653/v1/2024.iwslt-1.29
%P 235-253
Markdown (Informal)
[Recent Highlights in Multilingual and Multimodal Speech Translation](https://aclanthology.org/2024.iwslt-1.29) (Liu & Niehues, IWSLT 2024)
ACL