@inproceedings{mazuecos-etal-2021-region,
title = "Region under {D}iscussion for visual dialog",
author = "Mazuecos, Mauricio and
Luque, Franco M. and
S{\'a}nchez, Jorge and
Maina, Hern{\'a}n and
Vadora, Thomas and
Benotti, Luciana",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.390",
doi = "10.18653/v1/2021.emnlp-main.390",
pages = "4745--4759",
abstract = "Visual Dialog is assumed to require the dialog history to generate correct responses during a dialog. However, it is not clear from previous work how dialog history is needed for visual dialog. In this paper we define what it means for a visual question to require dialog history and we release a subset of the Guesswhat?! questions for which their dialog history completely changes their responses. We propose a novel interpretable representation that visually grounds dialog history: the Region under Discussion. It constrains the image{'}s spatial features according to a semantic representation of the history inspired by the information structure notion of Question under Discussion.We evaluate the architecture on task-specific multimodal models and the visual transformer model LXMERT.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mazuecos-etal-2021-region">
<titleInfo>
<title>Region under Discussion for visual dialog</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mauricio</namePart>
<namePart type="family">Mazuecos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Franco</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Luque</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorge</namePart>
<namePart type="family">Sánchez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hernán</namePart>
<namePart type="family">Maina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Vadora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luciana</namePart>
<namePart type="family">Benotti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanjing</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Scott</namePart>
<namePart type="given">Wen-tau</namePart>
<namePart type="family">Yih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online and Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Visual Dialog is assumed to require the dialog history to generate correct responses during a dialog. However, it is not clear from previous work how dialog history is needed for visual dialog. In this paper we define what it means for a visual question to require dialog history and we release a subset of the Guesswhat?! questions for which their dialog history completely changes their responses. We propose a novel interpretable representation that visually grounds dialog history: the Region under Discussion. It constrains the image’s spatial features according to a semantic representation of the history inspired by the information structure notion of Question under Discussion.We evaluate the architecture on task-specific multimodal models and the visual transformer model LXMERT.</abstract>
<identifier type="citekey">mazuecos-etal-2021-region</identifier>
<identifier type="doi">10.18653/v1/2021.emnlp-main.390</identifier>
<location>
<url>https://aclanthology.org/2021.emnlp-main.390</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>4745</start>
<end>4759</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Region under Discussion for visual dialog
%A Mazuecos, Mauricio
%A Luque, Franco M.
%A Sánchez, Jorge
%A Maina, Hernán
%A Vadora, Thomas
%A Benotti, Luciana
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online and Punta Cana, Dominican Republic
%F mazuecos-etal-2021-region
%X Visual Dialog is assumed to require the dialog history to generate correct responses during a dialog. However, it is not clear from previous work how dialog history is needed for visual dialog. In this paper we define what it means for a visual question to require dialog history and we release a subset of the Guesswhat?! questions for which their dialog history completely changes their responses. We propose a novel interpretable representation that visually grounds dialog history: the Region under Discussion. It constrains the image’s spatial features according to a semantic representation of the history inspired by the information structure notion of Question under Discussion.We evaluate the architecture on task-specific multimodal models and the visual transformer model LXMERT.
%R 10.18653/v1/2021.emnlp-main.390
%U https://aclanthology.org/2021.emnlp-main.390
%U https://doi.org/10.18653/v1/2021.emnlp-main.390
%P 4745-4759
Markdown (Informal)
[Region under Discussion for visual dialog](https://aclanthology.org/2021.emnlp-main.390) (Mazuecos et al., EMNLP 2021)
ACL
- Mauricio Mazuecos, Franco M. Luque, Jorge Sánchez, Hernán Maina, Thomas Vadora, and Luciana Benotti. 2021. Region under Discussion for visual dialog. In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pages 4745–4759, Online and Punta Cana, Dominican Republic. Association for Computational Linguistics.