@inproceedings{gwinnup-etal-2023-enhancing,
title = "Enhancing Video Translation Context with Object Labels",
author = "Gwinnup, Jeremy and
Anderson, Tim and
Ore, Brian and
Hansen, Eric and
Duh, Kevin",
editor = "Salesky, Elizabeth and
Federico, Marcello and
Carpuat, Marine",
booktitle = "Proceedings of the 20th International Conference on Spoken Language Translation (IWSLT 2023)",
month = jul,
year = "2023",
address = "Toronto, Canada (in-person and online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.iwslt-1.8",
doi = "10.18653/v1/2023.iwslt-1.8",
pages = "130--137",
abstract = "We present a simple yet efficient method to enhance the quality of machine translation models trained on multimodal corpora by augmenting the training text with labels of detected objects in the corresponding video segments. We then test the effects of label augmentation in both baseline and two automatic speech recognition (ASR) conditions. In contrast with multimodal techniques that merge visual and textual features, our modular method is easy to implement and the results are more interpretable. Comparisons are made with Transformer translation architectures trained with baseline and augmented labels, showing improvements of up to +1.0 BLEU on the How2 dataset.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gwinnup-etal-2023-enhancing">
<titleInfo>
<title>Enhancing Video Translation Context with Object Labels</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jeremy</namePart>
<namePart type="family">Gwinnup</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tim</namePart>
<namePart type="family">Anderson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Ore</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Hansen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Conference on Spoken Language Translation (IWSLT 2023)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Salesky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcello</namePart>
<namePart type="family">Federico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marine</namePart>
<namePart type="family">Carpuat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada (in-person and online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a simple yet efficient method to enhance the quality of machine translation models trained on multimodal corpora by augmenting the training text with labels of detected objects in the corresponding video segments. We then test the effects of label augmentation in both baseline and two automatic speech recognition (ASR) conditions. In contrast with multimodal techniques that merge visual and textual features, our modular method is easy to implement and the results are more interpretable. Comparisons are made with Transformer translation architectures trained with baseline and augmented labels, showing improvements of up to +1.0 BLEU on the How2 dataset.</abstract>
<identifier type="citekey">gwinnup-etal-2023-enhancing</identifier>
<identifier type="doi">10.18653/v1/2023.iwslt-1.8</identifier>
<location>
<url>https://aclanthology.org/2023.iwslt-1.8</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>130</start>
<end>137</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Enhancing Video Translation Context with Object Labels
%A Gwinnup, Jeremy
%A Anderson, Tim
%A Ore, Brian
%A Hansen, Eric
%A Duh, Kevin
%Y Salesky, Elizabeth
%Y Federico, Marcello
%Y Carpuat, Marine
%S Proceedings of the 20th International Conference on Spoken Language Translation (IWSLT 2023)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada (in-person and online)
%F gwinnup-etal-2023-enhancing
%X We present a simple yet efficient method to enhance the quality of machine translation models trained on multimodal corpora by augmenting the training text with labels of detected objects in the corresponding video segments. We then test the effects of label augmentation in both baseline and two automatic speech recognition (ASR) conditions. In contrast with multimodal techniques that merge visual and textual features, our modular method is easy to implement and the results are more interpretable. Comparisons are made with Transformer translation architectures trained with baseline and augmented labels, showing improvements of up to +1.0 BLEU on the How2 dataset.
%R 10.18653/v1/2023.iwslt-1.8
%U https://aclanthology.org/2023.iwslt-1.8
%U https://doi.org/10.18653/v1/2023.iwslt-1.8
%P 130-137
Markdown (Informal)
[Enhancing Video Translation Context with Object Labels](https://aclanthology.org/2023.iwslt-1.8) (Gwinnup et al., IWSLT 2023)
ACL
- Jeremy Gwinnup, Tim Anderson, Brian Ore, Eric Hansen, and Kevin Duh. 2023. Enhancing Video Translation Context with Object Labels. In Proceedings of the 20th International Conference on Spoken Language Translation (IWSLT 2023), pages 130–137, Toronto, Canada (in-person and online). Association for Computational Linguistics.