@inproceedings{pezzelle-etal-2020-different,
title = "{B}e {D}ifferent to {B}e {B}etter! {A} {B}enchmark to {L}everage the {C}omplementarity of {L}anguage and {V}ision",
author = "Pezzelle, Sandro and
Greco, Claudio and
Gandolfi, Greta and
Gualdoni, Eleonora and
Bernardi, Raffaella",
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.248",
doi = "10.18653/v1/2020.findings-emnlp.248",
pages = "2751--2767",
abstract = "This paper introduces BD2BB, a novel language and vision benchmark that requires multimodal models combine complementary information from the two modalities. Recently, impressive progress has been made to develop universal multimodal encoders suitable for virtually any language and vision tasks. However, current approaches often require them to combine redundant information provided by language and vision. Inspired by real-life communicative contexts, we propose a novel task where either modality is necessary but not sufficient to make a correct prediction. To do so, we first build a dataset of images and corresponding sentences provided by human participants. Second, we evaluate state-of-the-art models and compare their performance against human speakers. We show that, while the task is relatively easy for humans, best-performing models struggle to achieve similar results.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="pezzelle-etal-2020-different">
    <titleInfo>
      <title>Be Different to Be Better! A Benchmark to Leverage the Complementarity of Language and Vision</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Sandro</namePart>
      <namePart type="family">Pezzelle</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Claudio</namePart>
      <namePart type="family">Greco</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Greta</namePart>
      <namePart type="family">Gandolfi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Eleonora</namePart>
      <namePart type="family">Gualdoni</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Raffaella</namePart>
      <namePart type="family">Bernardi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Trevor</namePart>
        <namePart type="family">Cohn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yulan</namePart>
        <namePart type="family">He</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This paper introduces BD2BB, a novel language and vision benchmark that requires multimodal models to combine complementary information from the two modalities. Recently, impressive progress has been made to develop universal multimodal encoders suitable for virtually any language and vision task. However, current approaches often require them to combine redundant information provided by language and vision. Inspired by real-life communicative contexts, we propose a novel task where either modality is necessary but not sufficient to make a correct prediction. To do so, we first build a dataset of images and corresponding sentences provided by human participants. Second, we evaluate state-of-the-art models and compare their performance against human speakers. We show that, while the task is relatively easy for humans, best-performing models struggle to achieve similar results.</abstract>
<identifier type="citekey">pezzelle-etal-2020-different</identifier>
<identifier type="doi">10.18653/v1/2020.findings-emnlp.248</identifier>
<location>
<url>https://aclanthology.org/2020.findings-emnlp.248</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>2751</start>
<end>2767</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Be Different to Be Better! A Benchmark to Leverage the Complementarity of Language and Vision
%A Pezzelle, Sandro
%A Greco, Claudio
%A Gandolfi, Greta
%A Gualdoni, Eleonora
%A Bernardi, Raffaella
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F pezzelle-etal-2020-different
%X This paper introduces BD2BB, a novel language and vision benchmark that requires multimodal models to combine complementary information from the two modalities. Recently, impressive progress has been made to develop universal multimodal encoders suitable for virtually any language and vision task. However, current approaches often require them to combine redundant information provided by language and vision. Inspired by real-life communicative contexts, we propose a novel task where either modality is necessary but not sufficient to make a correct prediction. To do so, we first build a dataset of images and corresponding sentences provided by human participants. Second, we evaluate state-of-the-art models and compare their performance against human speakers. We show that, while the task is relatively easy for humans, best-performing models struggle to achieve similar results.
%R 10.18653/v1/2020.findings-emnlp.248
%U https://aclanthology.org/2020.findings-emnlp.248
%U https://doi.org/10.18653/v1/2020.findings-emnlp.248
%P 2751-2767
Markdown (Informal)
[Be Different to Be Better! A Benchmark to Leverage the Complementarity of Language and Vision](https://aclanthology.org/2020.findings-emnlp.248) (Pezzelle et al., Findings 2020)
ACL
Sandro Pezzelle, Claudio Greco, Greta Gandolfi, Eleonora Gualdoni, and Raffaella Bernardi. 2020. Be Different to Be Better! A Benchmark to Leverage the Complementarity of Language and Vision. In Findings of the Association for Computational Linguistics: EMNLP 2020, pages 2751–2767, Online. Association for Computational Linguistics.