@inproceedings{coppock-etal-2020-informativity,
title = "Informativity in Image Captions vs. Referring Expressions",
author = "Coppock, Elizabeth and
Dionne, Danielle and
Graham, Nathanial and
Ganem, Elias and
Zhao, Shijie and
Lin, Shawn and
Liu, Wenxing and
Wijaya, Derry",
editor = "Howes, Christine and
Chatzikyriakidis, Stergios and
Ek, Adam and
Somashekarappa, Vidya",
booktitle = "Proceedings of the Probability and Meaning Conference (PaM 2020)",
month = jun,
year = "2020",
address = "Gothenburg",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.pam-1.14",
pages = "104--108",
abstract = "At the intersection between computer vision and natural language processing, there has been recent progress on two natural language generation tasks: Dense Image Captioning and Referring Expression Generation for objects in complex scenes. The former aims to provide a caption for a specified object in a complex scene for the benefit of an interlocutor who may not be able to see it. The latter aims to produce a referring expression that will serve to identify a given object in a scene that the interlocutor can see. The two tasks are designed for different assumptions about the common ground between the interlocutors, and serve very different purposes, although they both associate a linguistic description with an object in a complex scene. Despite these fundamental differences, the distinction between these two tasks is sometimes overlooked. Here, we undertake a side-by-side comparison between image captioning and reference game human datasets and show that they differ systematically with respect to informativity. We hope that an understanding of the systematic differences among these human datasets will ultimately allow them to be leveraged more effectively in the associated engineering tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="coppock-etal-2020-informativity">
    <titleInfo>
      <title>Informativity in Image Captions vs. Referring Expressions</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Elizabeth</namePart>
      <namePart type="family">Coppock</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Danielle</namePart>
      <namePart type="family">Dionne</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nathanial</namePart>
      <namePart type="family">Graham</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Elias</namePart>
      <namePart type="family">Ganem</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shijie</namePart>
      <namePart type="family">Zhao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shawn</namePart>
      <namePart type="family">Lin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wenxing</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Derry</namePart>
      <namePart type="family">Wijaya</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Probability and Meaning Conference (PaM 2020)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christine</namePart>
        <namePart type="family">Howes</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stergios</namePart>
        <namePart type="family">Chatzikyriakidis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Adam</namePart>
        <namePart type="family">Ek</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vidya</namePart>
        <namePart type="family">Somashekarappa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Gothenburg</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>At the intersection between computer vision and natural language processing, there has been recent progress on two natural language generation tasks: Dense Image Captioning and Referring Expression Generation for objects in complex scenes. The former aims to provide a caption for a specified object in a complex scene for the benefit of an interlocutor who may not be able to see it. The latter aims to produce a referring expression that will serve to identify a given object in a scene that the interlocutor can see. The two tasks are designed for different assumptions about the common ground between the interlocutors, and serve very different purposes, although they both associate a linguistic description with an object in a complex scene. Despite these fundamental differences, the distinction between these two tasks is sometimes overlooked. Here, we undertake a side-by-side comparison between image captioning and reference game human datasets and show that they differ systematically with respect to informativity. We hope that an understanding of the systematic differences among these human datasets will ultimately allow them to be leveraged more effectively in the associated engineering tasks.</abstract>
    <identifier type="citekey">coppock-etal-2020-informativity</identifier>
    <location>
      <url>https://aclanthology.org/2020.pam-1.14</url>
    </location>
    <part>
      <date>2020-06</date>
      <extent unit="page">
        <start>104</start>
        <end>108</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Informativity in Image Captions vs. Referring Expressions
%A Coppock, Elizabeth
%A Dionne, Danielle
%A Graham, Nathanial
%A Ganem, Elias
%A Zhao, Shijie
%A Lin, Shawn
%A Liu, Wenxing
%A Wijaya, Derry
%Y Howes, Christine
%Y Chatzikyriakidis, Stergios
%Y Ek, Adam
%Y Somashekarappa, Vidya
%S Proceedings of the Probability and Meaning Conference (PaM 2020)
%D 2020
%8 June
%I Association for Computational Linguistics
%C Gothenburg
%F coppock-etal-2020-informativity
%X At the intersection between computer vision and natural language processing, there has been recent progress on two natural language generation tasks: Dense Image Captioning and Referring Expression Generation for objects in complex scenes. The former aims to provide a caption for a specified object in a complex scene for the benefit of an interlocutor who may not be able to see it. The latter aims to produce a referring expression that will serve to identify a given object in a scene that the interlocutor can see. The two tasks are designed for different assumptions about the common ground between the interlocutors, and serve very different purposes, although they both associate a linguistic description with an object in a complex scene. Despite these fundamental differences, the distinction between these two tasks is sometimes overlooked. Here, we undertake a side-by-side comparison between image captioning and reference game human datasets and show that they differ systematically with respect to informativity. We hope that an understanding of the systematic differences among these human datasets will ultimately allow them to be leveraged more effectively in the associated engineering tasks.
%U https://aclanthology.org/2020.pam-1.14
%P 104-108
Markdown (Informal):
[Informativity in Image Captions vs. Referring Expressions](https://aclanthology.org/2020.pam-1.14) (Coppock et al., PaM 2020)

ACL:
Elizabeth Coppock, Danielle Dionne, Nathanial Graham, Elias Ganem, Shijie Zhao, Shawn Lin, Wenxing Liu, and Derry Wijaya. 2020. Informativity in Image Captions vs. Referring Expressions. In Proceedings of the Probability and Meaning Conference (PaM 2020), pages 104–108, Gothenburg. Association for Computational Linguistics.