@inproceedings{chan-etal-2023-ic3,
title = "{IC}3: Image Captioning by Committee Consensus",
author = "Chan, David and
Myers, Austin and
Vijayanarasimhan, Sudheendra and
Ross, David and
Canny, John",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.556",
doi = "10.18653/v1/2023.emnlp-main.556",
pages = "8975--9003",
abstract = "If you ask a human to describe an image, they might do so in a thousand different ways. Traditionally, image captioning models are trained to generate a single {``}best{''} (most like a reference) image caption. Unfortunately, doing so encourages captions that are {``}informationally impoverished,{''} and focus on only a subset of the possible details, while ignoring other potentially useful information in the scene. In this work, we introduce a simple, yet novel, method: {``}Image Captioning by Committee Consensus{''} (IC3), designed to generate a single caption that captures high-level details from several annotator viewpoints. Humans rate captions produced by IC3 at least as helpful as baseline SOTA models more than two thirds of the time, and IC3 can improve the performance of SOTA automated recall systems by up to 84{\%}, outperforming single human-generated reference captions, and indicating significant improvements over SOTA approaches for visual description. Code is available at [https://davidmchan.github.io/caption-by-committee/](https://davidmchan.github.io/caption-by-committee/)",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chan-etal-2023-ic3">
<titleInfo>
<title>IC3: Image Captioning by Committee Consensus</title>
</titleInfo>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Chan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Austin</namePart>
<namePart type="family">Myers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sudheendra</namePart>
<namePart type="family">Vijayanarasimhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Ross</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Canny</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>If you ask a human to describe an image, they might do so in a thousand different ways. Traditionally, image captioning models are trained to generate a single “best” (most like a reference) image caption. Unfortunately, doing so encourages captions that are “informationally impoverished,” and focus on only a subset of the possible details, while ignoring other potentially useful information in the scene. In this work, we introduce a simple, yet novel, method: “Image Captioning by Committee Consensus” (IC3), designed to generate a single caption that captures high-level details from several annotator viewpoints. Humans rate captions produced by IC3 at least as helpful as baseline SOTA models more than two thirds of the time, and IC3 can improve the performance of SOTA automated recall systems by up to 84%, outperforming single human-generated reference captions, and indicating significant improvements over SOTA approaches for visual description. Code is available at [https://davidmchan.github.io/caption-by-committee/](https://davidmchan.github.io/caption-by-committee/)</abstract>
<identifier type="citekey">chan-etal-2023-ic3</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.556</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.556</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>8975</start>
<end>9003</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T IC3: Image Captioning by Committee Consensus
%A Chan, David
%A Myers, Austin
%A Vijayanarasimhan, Sudheendra
%A Ross, David
%A Canny, John
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F chan-etal-2023-ic3
%X If you ask a human to describe an image, they might do so in a thousand different ways. Traditionally, image captioning models are trained to generate a single “best” (most like a reference) image caption. Unfortunately, doing so encourages captions that are “informationally impoverished,” and focus on only a subset of the possible details, while ignoring other potentially useful information in the scene. In this work, we introduce a simple, yet novel, method: “Image Captioning by Committee Consensus” (IC3), designed to generate a single caption that captures high-level details from several annotator viewpoints. Humans rate captions produced by IC3 at least as helpful as baseline SOTA models more than two thirds of the time, and IC3 can improve the performance of SOTA automated recall systems by up to 84%, outperforming single human-generated reference captions, and indicating significant improvements over SOTA approaches for visual description. Code is available at [https://davidmchan.github.io/caption-by-committee/](https://davidmchan.github.io/caption-by-committee/)
%R 10.18653/v1/2023.emnlp-main.556
%U https://aclanthology.org/2023.emnlp-main.556
%U https://doi.org/10.18653/v1/2023.emnlp-main.556
%P 8975-9003
Markdown (Informal):
[IC3: Image Captioning by Committee Consensus](https://aclanthology.org/2023.emnlp-main.556) (Chan et al., EMNLP 2023)
ACL:
David Chan, Austin Myers, Sudheendra Vijayanarasimhan, David Ross, and John Canny. 2023. IC3: Image Captioning by Committee Consensus. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 8975–9003, Singapore. Association for Computational Linguistics.
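For readers skimming the abstract, the committee-consensus idea can be pictured as a two-stage pipeline: sample several candidate captions for the same image, then ask a language model to fuse them into one caption that keeps the details the candidates agree on. The sketch below is a minimal illustration of that idea only, not the authors' implementation; the `sample_captions` and `committee_consensus_caption` helpers, the `captioner`/`llm` interfaces, and the prompt wording are all hypothetical placeholders (the official code is at the URL given in the records above).

```python
# Minimal sketch of a committee-consensus captioning pipeline (hypothetical,
# not the authors' implementation): sample several candidate captions, then
# summarize them into a single caption covering the shared high-level details.

def sample_captions(image, captioner, k=5):
    """Draw k candidate captions from a captioning model (placeholder interface)."""
    return [captioner.generate(image, do_sample=True) for _ in range(k)]

def committee_consensus_caption(image, captioner, llm, k=5):
    """Fuse k sampled captions into one consensus caption via an LLM (placeholder)."""
    candidates = sample_captions(image, captioner, k)
    # Illustrative prompt wording only; the paper's actual prompts may differ.
    prompt = (
        "Several annotators described the same image as follows:\n"
        + "\n".join(f"- {c}" for c in candidates)
        + "\nWrite one caption that covers the details they agree on."
    )
    return llm.complete(prompt)
```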