BibTeX
@inproceedings{zhang-etal-2022-cross,
title = "Cross-Modal Similarity-Based Curriculum Learning for Image Captioning",
author = "Zhang, Hongkuan and
Sugawara, Saku and
Aizawa, Akiko and
Zhou, Lei and
Sasano, Ryohei and
Takeda, Koichi",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.516",
doi = "10.18653/v1/2022.emnlp-main.516",
pages = "7599--7606",
abstract = "Image captioning models require the high-level generalization ability to describe the contents of various images in words. Most existing approaches treat the image{--}caption pairs equally in their training without considering the differences in their learning difficulties. Several image captioning approaches introduce curriculum learning methods that present training data with increasing levels of difficulty. However, their difficulty measurements are either based on domain-specific features or prior model training. In this paper, we propose a simple yet efficient difficulty measurement for image captioning using cross-modal similarity calculated by a pretrained vision{--}language model. Experiments on the COCO and Flickr30k datasets show that our proposed approach achieves superior performance and competitive convergence speed to baselines without requiring heuristics or incurring additional training costs. Moreover, the higher model performance on difficult examples and unseen data also demonstrates the generalization ability.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2022-cross">
<titleInfo>
<title>Cross-Modal Similarity-Based Curriculum Learning for Image Captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hongkuan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saku</namePart>
<namePart type="family">Sugawara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akiko</namePart>
<namePart type="family">Aizawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lei</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryohei</namePart>
<namePart type="family">Sasano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koichi</namePart>
<namePart type="family">Takeda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Image captioning models require the high-level generalization ability to describe the contents of various images in words. Most existing approaches treat the image–caption pairs equally in their training without considering the differences in their learning difficulties. Several image captioning approaches introduce curriculum learning methods that present training data with increasing levels of difficulty. However, their difficulty measurements are either based on domain-specific features or prior model training. In this paper, we propose a simple yet efficient difficulty measurement for image captioning using cross-modal similarity calculated by a pretrained vision–language model. Experiments on the COCO and Flickr30k datasets show that our proposed approach achieves superior performance and competitive convergence speed to baselines without requiring heuristics or incurring additional training costs. Moreover, the higher model performance on difficult examples and unseen data also demonstrates the generalization ability.</abstract>
<identifier type="citekey">zhang-etal-2022-cross</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.516</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.516</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>7599</start>
<end>7606</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Cross-Modal Similarity-Based Curriculum Learning for Image Captioning
%A Zhang, Hongkuan
%A Sugawara, Saku
%A Aizawa, Akiko
%A Zhou, Lei
%A Sasano, Ryohei
%A Takeda, Koichi
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F zhang-etal-2022-cross
%X Image captioning models require the high-level generalization ability to describe the contents of various images in words. Most existing approaches treat the image–caption pairs equally in their training without considering the differences in their learning difficulties. Several image captioning approaches introduce curriculum learning methods that present training data with increasing levels of difficulty. However, their difficulty measurements are either based on domain-specific features or prior model training. In this paper, we propose a simple yet efficient difficulty measurement for image captioning using cross-modal similarity calculated by a pretrained vision–language model. Experiments on the COCO and Flickr30k datasets show that our proposed approach achieves superior performance and competitive convergence speed to baselines without requiring heuristics or incurring additional training costs. Moreover, the higher model performance on difficult examples and unseen data also demonstrates the generalization ability.
%R 10.18653/v1/2022.emnlp-main.516
%U https://aclanthology.org/2022.emnlp-main.516
%U https://doi.org/10.18653/v1/2022.emnlp-main.516
%P 7599-7606
Markdown (Informal)
[Cross-Modal Similarity-Based Curriculum Learning for Image Captioning](https://aclanthology.org/2022.emnlp-main.516) (Zhang et al., EMNLP 2022)
ACL
Hongkuan Zhang, Saku Sugawara, Akiko Aizawa, Lei Zhou, Ryohei Sasano, and Koichi Takeda. 2022. Cross-Modal Similarity-Based Curriculum Learning for Image Captioning. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pages 7599–7606, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
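
The abstract above describes scoring each image–caption pair by cross-modal similarity from a pretrained vision–language model and ordering training data from easy to hard. The following is a minimal sketch of that idea, not the authors' released code; it assumes CLIP (via Hugging Face transformers) as the pretrained vision–language model, which is only an illustrative choice, and treats higher similarity as an easier example.

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Illustrative checkpoint; the paper only specifies "a pretrained vision-language model".
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model.eval()

def difficulty_scores(image_paths, captions):
    """Cosine similarity for each aligned (image, caption) pair.

    Higher similarity is treated as an easier example, so sorting by
    descending similarity gives an easy-to-hard curriculum order.
    """
    images = [Image.open(p).convert("RGB") for p in image_paths]
    inputs = processor(text=captions, images=images,
                       return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        image_emb = model.get_image_features(pixel_values=inputs["pixel_values"])
        text_emb = model.get_text_features(input_ids=inputs["input_ids"],
                                           attention_mask=inputs["attention_mask"])
    # Normalize and take the per-pair dot product (cosine similarity).
    image_emb = image_emb / image_emb.norm(dim=-1, keepdim=True)
    text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
    return (image_emb * text_emb).sum(dim=-1)

# Usage (hypothetical file names): order a toy training set from easy to hard.
# paths, caps = ["cat.jpg", "street.jpg"], ["a cat on a sofa", "a busy street"]
# order = difficulty_scores(paths, caps).argsort(descending=True)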