@inproceedings{shi-etal-2021-retrieval-analogy,
title = "Retrieval, Analogy, and Composition: A framework for Compositional Generalization in Image Captioning",
author = "Shi, Zhan and
Liu, Hui and
Min, Martin Renqiang and
Malon, Christopher and
Li, Li Erran and
Zhu, Xiaodan",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2021",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.findings-emnlp.171/",
doi = "10.18653/v1/2021.findings-emnlp.171",
pages = "1990--2000",
abstract = "Image captioning systems are expected to have the ability to combine individual concepts when describing scenes with concept combinations that are not observed during training. In spite of significant progress in image captioning with the help of the autoregressive generation framework, current approaches fail to generalize well to novel concept combinations. We propose a new framework that revolves around probing several similar image caption training instances (retrieval), performing analogical reasoning over relevant entities in retrieved prototypes (analogy), and enhancing the generation process with reasoning outcomes (composition). Our method augments the generation model by referring to the neighboring instances in the training set to produce novel concept combinations in generated captions. We perform experiments on the widely used image captioning benchmarks. The proposed models achieve substantial improvement over the compared baselines on both composition-related evaluation metrics and conventional image captioning metrics."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shi-etal-2021-retrieval-analogy">
<titleInfo>
<title>Retrieval, Analogy, and Composition: A framework for Compositional Generalization in Image Captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhan</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hui</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Martin</namePart>
<namePart type="given">Renqiang</namePart>
<namePart type="family">Min</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Malon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Li</namePart>
<namePart type="given">Erran</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaodan</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2021</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanjing</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Scott</namePart>
<namePart type="given">Wen-tau</namePart>
<namePart type="family">Yih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Image captioning systems are expected to have the ability to combine individual concepts when describing scenes with concept combinations that are not observed during training. In spite of significant progress in image captioning with the help of the autoregressive generation framework, current approaches fail to generalize well to novel concept combinations. We propose a new framework that revolves around probing several similar image caption training instances (retrieval), performing analogical reasoning over relevant entities in retrieved prototypes (analogy), and enhancing the generation process with reasoning outcomes (composition). Our method augments the generation model by referring to the neighboring instances in the training set to produce novel concept combinations in generated captions. We perform experiments on the widely used image captioning benchmarks. The proposed models achieve substantial improvement over the compared baselines on both composition-related evaluation metrics and conventional image captioning metrics.</abstract>
<identifier type="citekey">shi-etal-2021-retrieval-analogy</identifier>
<identifier type="doi">10.18653/v1/2021.findings-emnlp.171</identifier>
<location>
<url>https://aclanthology.org/2021.findings-emnlp.171/</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>1990</start>
<end>2000</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Retrieval, Analogy, and Composition: A framework for Compositional Generalization in Image Captioning
%A Shi, Zhan
%A Liu, Hui
%A Min, Martin Renqiang
%A Malon, Christopher
%A Li, Li Erran
%A Zhu, Xiaodan
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Findings of the Association for Computational Linguistics: EMNLP 2021
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F shi-etal-2021-retrieval-analogy
%X Image captioning systems are expected to have the ability to combine individual concepts when describing scenes with concept combinations that are not observed during training. In spite of significant progress in image captioning with the help of the autoregressive generation framework, current approaches fail to generalize well to novel concept combinations. We propose a new framework that revolves around probing several similar image caption training instances (retrieval), performing analogical reasoning over relevant entities in retrieved prototypes (analogy), and enhancing the generation process with reasoning outcomes (composition). Our method augments the generation model by referring to the neighboring instances in the training set to produce novel concept combinations in generated captions. We perform experiments on the widely used image captioning benchmarks. The proposed models achieve substantial improvement over the compared baselines on both composition-related evaluation metrics and conventional image captioning metrics.
%R 10.18653/v1/2021.findings-emnlp.171
%U https://aclanthology.org/2021.findings-emnlp.171/
%U https://doi.org/10.18653/v1/2021.findings-emnlp.171
%P 1990-2000
Markdown (Informal)
[Retrieval, Analogy, and Composition: A framework for Compositional Generalization in Image Captioning](https://aclanthology.org/2021.findings-emnlp.171/) (Shi et al., Findings 2021)
ACL
- Zhan Shi, Hui Liu, Martin Renqiang Min, Christopher Malon, Li Erran Li, and Xiaodan Zhu. 2021. Retrieval, Analogy, and Composition: A framework for Compositional Generalization in Image Captioning. In Findings of the Association for Computational Linguistics: EMNLP 2021, pages 1990–2000, Punta Cana, Dominican Republic. Association for Computational Linguistics.