% Workshop paper from the LANTERN 2019 proceedings.
% The stylised capitals in booktitle spell out the workshop acronym, so the
% affected words are brace-protected against style-driven recasing.
@inproceedings{dutta-chowdhury-elliott-2019-understanding,
  title     = {Understanding the Effect of Textual Adversaries in Multimodal Machine Translation},
  author    = {Dutta Chowdhury, Koel and
               Elliott, Desmond},
  editor    = {Mogadala, Aditya and
               Klakow, Dietrich and
               Pezzelle, Sandro and
               Moens, Marie-Francine},
  booktitle = {Proceedings of the Beyond Vision and {LANguage}: {inTEgrating} {Real-world} {kNowledge} ({LANTERN})},
  month     = nov,
  year      = {2019},
  address   = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/D19-6406},
  doi       = {10.18653/v1/D19-6406},
  pages     = {35--40},
  abstract  = {It is assumed that multimodal machine translation systems are better than text-only systems at translating phrases that have a direct correspondence in the image. This assumption has been challenged in experiments demonstrating that state-of-the-art multimodal systems perform equally well in the presence of randomly selected images, but, more recently, it has been shown that masking entities from the source language sentence during training can help to overcome this problem. In this paper, we conduct experiments with both visual and textual adversaries in order to understand the role of incorrect textual inputs to such systems. Our results show that when the source language sentence contains mistakes, multimodal translation systems do not leverage the additional visual signal to produce the correct translation. We also find that the degradation of translation performance caused by textual adversaries is significantly higher than by visual adversaries.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey dutta-chowdhury-elliott-2019-understanding. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="dutta-chowdhury-elliott-2019-understanding">
    <titleInfo>
      <title>Understanding the Effect of Textual Adversaries in Multimodal Machine Translation</title>
    </titleInfo>
    <!-- Authors -->
    <name type="personal">
      <namePart type="given">Koel</namePart>
      <namePart type="family">Dutta Chowdhury</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Desmond</namePart>
      <namePart type="family">Elliott</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2019-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors and its publisher -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Aditya</namePart>
        <namePart type="family">Mogadala</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Dietrich</namePart>
        <namePart type="family">Klakow</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sandro</namePart>
        <namePart type="family">Pezzelle</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marie-Francine</namePart>
        <namePart type="family">Moens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Hong Kong, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>It is assumed that multimodal machine translation systems are better than text-only systems at translating phrases that have a direct correspondence in the image. This assumption has been challenged in experiments demonstrating that state-of-the-art multimodal systems perform equally well in the presence of randomly selected images, but, more recently, it has been shown that masking entities from the source language sentence during training can help to overcome this problem. In this paper, we conduct experiments with both visual and textual adversaries in order to understand the role of incorrect textual inputs to such systems. Our results show that when the source language sentence contains mistakes, multimodal translation systems do not leverage the additional visual signal to produce the correct translation. We also find that the degradation of translation performance caused by textual adversaries is significantly higher than by visual adversaries.</abstract>
    <!-- Identifiers and access location -->
    <identifier type="citekey">dutta-chowdhury-elliott-2019-understanding</identifier>
    <identifier type="doi">10.18653/v1/D19-6406</identifier>
    <location>
      <url>https://aclanthology.org/D19-6406</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2019-11</date>
      <extent unit="page">
        <start>35</start>
        <end>40</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Understanding the Effect of Textual Adversaries in Multimodal Machine Translation
%A Dutta Chowdhury, Koel
%A Elliott, Desmond
%Y Mogadala, Aditya
%Y Klakow, Dietrich
%Y Pezzelle, Sandro
%Y Moens, Marie-Francine
%S Proceedings of the Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN)
%D 2019
%8 November
%I Association for Computational Linguistics
%C Hong Kong, China
%F dutta-chowdhury-elliott-2019-understanding
%X It is assumed that multimodal machine translation systems are better than text-only systems at translating phrases that have a direct correspondence in the image. This assumption has been challenged in experiments demonstrating that state-of-the-art multimodal systems perform equally well in the presence of randomly selected images, but, more recently, it has been shown that masking entities from the source language sentence during training can help to overcome this problem. In this paper, we conduct experiments with both visual and textual adversaries in order to understand the role of incorrect textual inputs to such systems. Our results show that when the source language sentence contains mistakes, multimodal translation systems do not leverage the additional visual signal to produce the correct translation. We also find that the degradation of translation performance caused by textual adversaries is significantly higher than by visual adversaries.
%R 10.18653/v1/D19-6406
%U https://aclanthology.org/D19-6406
%U https://doi.org/10.18653/v1/D19-6406
%P 35-40
Markdown (Informal)
[Understanding the Effect of Textual Adversaries in Multimodal Machine Translation](https://aclanthology.org/D19-6406) (Dutta Chowdhury & Elliott, 2019)
ACL