@inproceedings{tanaka-etal-2024-content,
title = "Content-Specific Humorous Image Captioning Using Incongruity Resolution Chain-of-Thought",
author = "Tanaka, Kohtaro and
Uehara, Kohei and
Gu, Lin and
Mukuta, Yusuke and
Harada, Tatsuya",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.152",
doi = "10.18653/v1/2024.findings-naacl.152",
pages = "2348--2367",
abstract = "Although automated image captioning methods have benefited considerably from the development of large language models (LLMs), generating humorous captions is still a challenging task. Humorous captions generated by humans are unique to the image and reflect the content of the image. However, captions generated using previous captioning models tend to be generic. Therefore, we propose incongruity-resolution chain-of-thought (IRCoT) as a novel prompting framework that creates content-specific resolutions from fine details extracted from an image. Furthermore, we integrate logit bias and negative sampling to suppress the output of generic resolutions. The results of experiments with GPT4-V demonstrate that our proposed framework effectively generated humorous captions tailored to the content of specific input images.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tanaka-etal-2024-content">
<titleInfo>
<title>Content-Specific Humorous Image Captioning Using Incongruity Resolution Chain-of-Thought</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kohtaro</namePart>
<namePart type="family">Tanaka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kohei</namePart>
<namePart type="family">Uehara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lin</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yusuke</namePart>
<namePart type="family">Mukuta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatsuya</namePart>
<namePart type="family">Harada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Although automated image captioning methods have benefited considerably from the development of large language models (LLMs), generating humorous captions is still a challenging task. Humorous captions generated by humans are unique to the image and reflect the content of the image. However, captions generated using previous captioning models tend to be generic. Therefore, we propose incongruity-resolution chain-of-thought (IRCoT) as a novel prompting framework that creates content-specific resolutions from fine details extracted from an image. Furthermore, we integrate logit bias and negative sampling to suppress the output of generic resolutions. The results of experiments with GPT4-V demonstrate that our proposed framework effectively generated humorous captions tailored to the content of specific input images.</abstract>
<identifier type="citekey">tanaka-etal-2024-content</identifier>
<identifier type="doi">10.18653/v1/2024.findings-naacl.152</identifier>
<location>
<url>https://aclanthology.org/2024.findings-naacl.152</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>2348</start>
<end>2367</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Content-Specific Humorous Image Captioning Using Incongruity Resolution Chain-of-Thought
%A Tanaka, Kohtaro
%A Uehara, Kohei
%A Gu, Lin
%A Mukuta, Yusuke
%A Harada, Tatsuya
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F tanaka-etal-2024-content
%X Although automated image captioning methods have benefited considerably from the development of large language models (LLMs), generating humorous captions is still a challenging task. Humorous captions generated by humans are unique to the image and reflect the content of the image. However, captions generated using previous captioning models tend to be generic. Therefore, we propose incongruity-resolution chain-of-thought (IRCoT) as a novel prompting framework that creates content-specific resolutions from fine details extracted from an image. Furthermore, we integrate logit bias and negative sampling to suppress the output of generic resolutions. The results of experiments with GPT4-V demonstrate that our proposed framework effectively generated humorous captions tailored to the content of specific input images.
%R 10.18653/v1/2024.findings-naacl.152
%U https://aclanthology.org/2024.findings-naacl.152
%U https://doi.org/10.18653/v1/2024.findings-naacl.152
%P 2348-2367
Markdown (Informal)
[Content-Specific Humorous Image Captioning Using Incongruity Resolution Chain-of-Thought](https://aclanthology.org/2024.findings-naacl.152) (Tanaka et al., Findings 2024)
ACL