@inproceedings{li-etal-2021-feature,
title = "Feature-level Incongruence Reduction for Multimodal Translation",
author = "Li, Zhifeng and
Hong, Yu and
Pan, Yuchen and
Tang, Jian and
Yao, Jianmin and
Zhou, Guodong",
editor = "{Xin} and
Hu, Ronghang and
Hudson, Drew and
Fu, Tsu-Jui and
Rohrbach, Marcus and
Fried, Daniel",
booktitle = "Proceedings of the Second Workshop on Advances in Language and Vision Research",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.alvr-1.1",
doi = "10.18653/v1/2021.alvr-1.1",
pages = "1--10",
abstract = "Caption translation aims to translate image annotations (captions for short). Recently, Multimodal Neural Machine Translation (MNMT) has been explored as the essential solution. Besides of linguistic features in captions, MNMT allows visual(image) features to be used. The integration of multimodal features reinforces the semantic representation and considerably improves translation performance. However, MNMT suffers from the incongruence between visual and linguistic features. To overcome the problem, we propose to extend MNMT architecture with a harmonization network, which harmonizes multimodal features(linguistic and visual features)by unidirectional modal space conversion. It enables multimodal translation to be carried out in a seemingly monomodal translation pipeline. We experiment on the golden Multi30k-16 and 17. Experimental results show that, compared to the baseline,the proposed method yields the improvements of 2.2{\%} BLEU for the scenario of translating English captions into German (En→De) at best,7.6{\%} for the case of English-to-French translation(En→Fr) and 1.5{\%} for English-to-Czech(En→Cz). The utilization of harmonization network leads to the competitive performance to the-state-of-the-art.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2021-feature">
<titleInfo>
<title>Feature-level Incongruence Reduction for Multimodal Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhifeng</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Hong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuchen</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jian</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianmin</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guodong</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Advances in Language and Vision Research</title>
</titleInfo>
<name>
<namePart>Xin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronghang</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Drew</namePart>
<namePart type="family">Hudson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tsu-Jui</namePart>
<namePart type="family">Fu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcus</namePart>
<namePart type="family">Rohrbach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Fried</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Caption translation aims to translate image annotations (captions for short). Recently, Multimodal Neural Machine Translation (MNMT) has been explored as the essential solution. Besides of linguistic features in captions, MNMT allows visual(image) features to be used. The integration of multimodal features reinforces the semantic representation and considerably improves translation performance. However, MNMT suffers from the incongruence between visual and linguistic features. To overcome the problem, we propose to extend MNMT architecture with a harmonization network, which harmonizes multimodal features(linguistic and visual features)by unidirectional modal space conversion. It enables multimodal translation to be carried out in a seemingly monomodal translation pipeline. We experiment on the golden Multi30k-16 and 17. Experimental results show that, compared to the baseline,the proposed method yields the improvements of 2.2% BLEU for the scenario of translating English captions into German (En→De) at best,7.6% for the case of English-to-French translation(En→Fr) and 1.5% for English-to-Czech(En→Cz). The utilization of harmonization network leads to the competitive performance to the-state-of-the-art.</abstract>
<identifier type="citekey">li-etal-2021-feature</identifier>
<identifier type="doi">10.18653/v1/2021.alvr-1.1</identifier>
<location>
<url>https://aclanthology.org/2021.alvr-1.1</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>1</start>
<end>10</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Feature-level Incongruence Reduction for Multimodal Translation
%A Li, Zhifeng
%A Hong, Yu
%A Pan, Yuchen
%A Tang, Jian
%A Yao, Jianmin
%A Zhou, Guodong
%Y Hu, Ronghang
%Y Hudson, Drew
%Y Fu, Tsu-Jui
%Y Rohrbach, Marcus
%Y Fried, Daniel
%E Xin
%S Proceedings of the Second Workshop on Advances in Language and Vision Research
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F li-etal-2021-feature
%X Caption translation aims to translate image annotations (captions for short). Recently, Multimodal Neural Machine Translation (MNMT) has been explored as the essential solution. Besides of linguistic features in captions, MNMT allows visual(image) features to be used. The integration of multimodal features reinforces the semantic representation and considerably improves translation performance. However, MNMT suffers from the incongruence between visual and linguistic features. To overcome the problem, we propose to extend MNMT architecture with a harmonization network, which harmonizes multimodal features(linguistic and visual features)by unidirectional modal space conversion. It enables multimodal translation to be carried out in a seemingly monomodal translation pipeline. We experiment on the golden Multi30k-16 and 17. Experimental results show that, compared to the baseline,the proposed method yields the improvements of 2.2% BLEU for the scenario of translating English captions into German (En→De) at best,7.6% for the case of English-to-French translation(En→Fr) and 1.5% for English-to-Czech(En→Cz). The utilization of harmonization network leads to the competitive performance to the-state-of-the-art.
%R 10.18653/v1/2021.alvr-1.1
%U https://aclanthology.org/2021.alvr-1.1
%U https://doi.org/10.18653/v1/2021.alvr-1.1
%P 1-10
Markdown (Informal)
[Feature-level Incongruence Reduction for Multimodal Translation](https://aclanthology.org/2021.alvr-1.1) (Li et al., ALVR 2021)
ACL