@inproceedings{bane-2024-evaluating,
title = "Evaluating End-to-End Speech-to-Speech Translation for Dubbing: Challenges and New Metrics",
author = "Bane, Fred",
editor = "Martindale, Marianna and
Campbell, Janice and
Savenkov, Konstantin and
Goel, Shivali",
booktitle = "Proceedings of the 16th Conference of the Association for Machine Translation in the Americas (Volume 2: Presentations)",
month = sep,
year = "2024",
address = "Chicago, USA",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2024.amta-presentations.13",
pages = "184--207",
abstract = "The advent of end-to-end speech-to-speech translation (S2ST) systems in recent years marks a significant advancement over traditional cascaded approaches. These novel systems represent a direct translation pathway from spoken input to spoken output without relying on intermediate text forms. However, evaluation methods for this task, such as ASR BLEU, are often still compartmentalized and text-based. We suggest the quality of the resulting speech must be measured too. Naturalness, similarity of the target voice to the original, reflection of accents, and rhythm are all important. We argue that new evaluation metrics are needed in response to this watershed change. Our presentation approaches this topic through the lens of dubbing, with a particular focus on voice over. We begin with a critical examination of existing metrics. Then we discuss key features of S2ST that are inadequately captured. Finally, we propose new directions for evaluation of S2ST systems.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bane-2024-evaluating">
<titleInfo>
<title>Evaluating End-to-End Speech-to-Speech Translation for Dubbing: Challenges and New Metrics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fred</namePart>
<namePart type="family">Bane</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th Conference of the Association for Machine Translation in the Americas (Volume 2: Presentations)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Martindale</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Janice</namePart>
<namePart type="family">Campbell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Konstantin</namePart>
<namePart type="family">Savenkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shivali</namePart>
<namePart type="family">Goel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Machine Translation in the Americas</publisher>
<place>
<placeTerm type="text">Chicago, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The advent of end-to-end speech-to-speech translation (S2ST) systems in recent years marks a significant advancement over traditional cascaded approaches. These novel systems represent a direct translation pathway from spoken input to spoken output without relying on intermediate text forms. However, evaluation methods for this task, such as ASR BLEU, are often still compartmentalized and text-based. We suggest the quality of the resulting speech must be measured too. Naturalness, similarity of the target voice to the original, reflection of accents, and rhythm are all important. We argue that new evaluation metrics are needed in response to this watershed change. Our presentation approaches this topic through the lens of dubbing, with a particular focus on voice over. We begin with a critical examination of existing metrics. Then we discuss key features of S2ST that are inadequately captured. Finally, we propose new directions for evaluation of S2ST systems.</abstract>
<identifier type="citekey">bane-2024-evaluating</identifier>
<location>
<url>https://aclanthology.org/2024.amta-presentations.13</url>
</location>
<part>
<date>2024-09</date>
<extent unit="page">
<start>184</start>
<end>207</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating End-to-End Speech-to-Speech Translation for Dubbing: Challenges and New Metrics
%A Bane, Fred
%Y Martindale, Marianna
%Y Campbell, Janice
%Y Savenkov, Konstantin
%Y Goel, Shivali
%S Proceedings of the 16th Conference of the Association for Machine Translation in the Americas (Volume 2: Presentations)
%D 2024
%8 September
%I Association for Machine Translation in the Americas
%C Chicago, USA
%F bane-2024-evaluating
%X The advent of end-to-end speech-to-speech translation (S2ST) systems in recent years marks a significant advancement over traditional cascaded approaches. These novel systems represent a direct translation pathway from spoken input to spoken output without relying on intermediate text forms. However, evaluation methods for this task, such as ASR BLEU, are often still compartmentalized and text-based. We suggest the quality of the resulting speech must be measured too. Naturalness, similarity of the target voice to the original, reflection of accents, and rhythm are all important. We argue that new evaluation metrics are needed in response to this watershed change. Our presentation approaches this topic through the lens of dubbing, with a particular focus on voice over. We begin with a critical examination of existing metrics. Then we discuss key features of S2ST that are inadequately captured. Finally, we propose new directions for evaluation of S2ST systems.
%U https://aclanthology.org/2024.amta-presentations.13
%P 184-207
Markdown (Informal)
[Evaluating End-to-End Speech-to-Speech Translation for Dubbing: Challenges and New Metrics](https://aclanthology.org/2024.amta-presentations.13) (Bane, AMTA 2024)
ACL