@inproceedings{cohn-gordon-etal-2018-pragmatically,
title = "Pragmatically Informative Image Captioning with Character-Level Inference",
author = "Cohn-Gordon, Reuben and
Goodman, Noah and
Potts, Christopher",
editor = "Walker, Marilyn and
Ji, Heng and
Stent, Amanda",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)",
month = jun,
year = "2018",
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N18-2070/",
doi = "10.18653/v1/N18-2070",
pages = "439--443",
abstract = "We combine a neural image captioner with a Rational Speech Acts (RSA) model to make a system that is pragmatically informative: its objective is to produce captions that are not merely true but also distinguish their inputs from similar images. Previous attempts to combine RSA with neural image captioning require an inference which normalizes over the entire set of possible utterances. This poses a serious problem of efficiency, previously solved by sampling a small subset of possible utterances. We instead solve this problem by implementing a version of RSA which operates at the level of characters ({\textquotedblleft}a{\textquotedblright}, {\textquotedblleft}b{\textquotedblright}, {\textquotedblleft}c{\textquotedblright}, ...) during the unrolling of the caption. We find that the utterance-level effect of referential captions can be obtained with only character-level decisions. Finally, we introduce an automatic method for testing the performance of pragmatic speaker models, and show that our model outperforms a non-pragmatic baseline as well as a word-level RSA captioner."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cohn-gordon-etal-2018-pragmatically">
<titleInfo>
<title>Pragmatically Informative Image Captioning with Character-Level Inference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Reuben</namePart>
<namePart type="family">Cohn-Gordon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noah</namePart>
<namePart type="family">Goodman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Potts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marilyn</namePart>
<namePart type="family">Walker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amanda</namePart>
<namePart type="family">Stent</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">New Orleans, Louisiana</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We combine a neural image captioner with a Rational Speech Acts (RSA) model to make a system that is pragmatically informative: its objective is to produce captions that are not merely true but also distinguish their inputs from similar images. Previous attempts to combine RSA with neural image captioning require an inference which normalizes over the entire set of possible utterances. This poses a serious problem of efficiency, previously solved by sampling a small subset of possible utterances. We instead solve this problem by implementing a version of RSA which operates at the level of characters (“a”, “b”, “c”, ...) during the unrolling of the caption. We find that the utterance-level effect of referential captions can be obtained with only character-level decisions. Finally, we introduce an automatic method for testing the performance of pragmatic speaker models, and show that our model outperforms a non-pragmatic baseline as well as a word-level RSA captioner.</abstract>
<identifier type="citekey">cohn-gordon-etal-2018-pragmatically</identifier>
<identifier type="doi">10.18653/v1/N18-2070</identifier>
<location>
<url>https://aclanthology.org/N18-2070/</url>
</location>
<part>
<date>2018-06</date>
<extent unit="page">
<start>439</start>
<end>443</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Pragmatically Informative Image Captioning with Character-Level Inference
%A Cohn-Gordon, Reuben
%A Goodman, Noah
%A Potts, Christopher
%Y Walker, Marilyn
%Y Ji, Heng
%Y Stent, Amanda
%S Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)
%D 2018
%8 June
%I Association for Computational Linguistics
%C New Orleans, Louisiana
%F cohn-gordon-etal-2018-pragmatically
%X We combine a neural image captioner with a Rational Speech Acts (RSA) model to make a system that is pragmatically informative: its objective is to produce captions that are not merely true but also distinguish their inputs from similar images. Previous attempts to combine RSA with neural image captioning require an inference which normalizes over the entire set of possible utterances. This poses a serious problem of efficiency, previously solved by sampling a small subset of possible utterances. We instead solve this problem by implementing a version of RSA which operates at the level of characters (“a”, “b”, “c”, ...) during the unrolling of the caption. We find that the utterance-level effect of referential captions can be obtained with only character-level decisions. Finally, we introduce an automatic method for testing the performance of pragmatic speaker models, and show that our model outperforms a non-pragmatic baseline as well as a word-level RSA captioner.
%R 10.18653/v1/N18-2070
%U https://aclanthology.org/N18-2070/
%U https://doi.org/10.18653/v1/N18-2070
%P 439-443
Markdown (Informal)
[Pragmatically Informative Image Captioning with Character-Level Inference](https://aclanthology.org/N18-2070/) (Cohn-Gordon et al., NAACL 2018)
ACL
- Reuben Cohn-Gordon, Noah Goodman, and Christopher Potts. 2018. Pragmatically Informative Image Captioning with Character-Level Inference. In Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers), pages 439–443, New Orleans, Louisiana. Association for Computational Linguistics.