@inproceedings{you-etal-2025-cross,
title = "Cross-modal Clustering-based Retrieval for Scalable and Robust Image Captioning",
author = "You, Jingyi and
Sasaki, Hiroshi and
Kadowaki, Kazuma",
editor = "Kriz, Reno and
Murray, Kenton",
booktitle = "Proceedings of the 1st Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2025)",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.magmar-1.4/",
doi = "10.18653/v1/2025.magmar-1.4",
pages = "47--58",
ISBN = "979-8-89176-280-0",
abstract = "Recent advances in retrieval-augmented generative image captioning (RAG-IC) have significantly improved caption quality by incorporating external knowledge and similar examples into language model-driven caption generators. However, these methods still encounter challenges when applied to real-world scenarios. First, many existing approaches rely on bimodal retrieval datastores that require large amounts of labeled data and substantial manual effort to construct, making them costly and time-consuming. Moreover, they simply retrieve the nearest samples to the input query from datastores, which leads to high redundancy in the retrieved content and subsequently degrades the quality of the generated captions. In this paper, we introduce a novel RAG-IC approach named \textit{ \textbf{C}r\textbf{o}ss-modal \textbf{Di}versity-promoting \textbf{Ret}rieval technique} (CoDiRet), which integrates a text-only unimodal retrieval module with our unique cluster-based retrieval mechanism. This proposal simultaneously enhances the scalability of the datastore, promotes diversity in retrieved content, and improves robustness against out-of-domain inputs, which eventually facilitates real-world applications. Experimental results demonstrate that our method, despite being exclusively trained on the COCO benchmark dataset, achieves competitive performance on the in-domain benchmark and generalizes robustly across different domains without additional training."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="you-etal-2025-cross">
    <titleInfo>
      <title>Cross-modal Clustering-based Retrieval for Scalable and Robust Image Captioning</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jingyi</namePart>
      <namePart type="family">You</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hiroshi</namePart>
      <namePart type="family">Sasaki</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kazuma</namePart>
      <namePart type="family">Kadowaki</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 1st Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Reno</namePart>
        <namePart type="family">Kriz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kenton</namePart>
        <namePart type="family">Murray</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-280-0</identifier>
    </relatedItem>
    <abstract>Recent advances in retrieval-augmented generative image captioning (RAG-IC) have significantly improved caption quality by incorporating external knowledge and similar examples into language model-driven caption generators. However, these methods still encounter challenges when applied to real-world scenarios. First, many existing approaches rely on bimodal retrieval datastores that require large amounts of labeled data and substantial manual effort to construct, making them costly and time-consuming. Moreover, they simply retrieve the nearest samples to the input query from datastores, which leads to high redundancy in the retrieved content and subsequently degrades the quality of the generated captions. In this paper, we introduce a novel RAG-IC approach named Cross-modal Diversity-promoting Retrieval technique (CoDiRet), which integrates a text-only unimodal retrieval module with our unique cluster-based retrieval mechanism. This proposal simultaneously enhances the scalability of the datastore, promotes diversity in retrieved content, and improves robustness against out-of-domain inputs, which eventually facilitates real-world applications. Experimental results demonstrate that our method, despite being exclusively trained on the COCO benchmark dataset, achieves competitive performance on the in-domain benchmark and generalizes robustly across different domains without additional training.</abstract>
    <identifier type="citekey">you-etal-2025-cross</identifier>
    <identifier type="doi">10.18653/v1/2025.magmar-1.4</identifier>
    <location>
      <url>https://aclanthology.org/2025.magmar-1.4/</url>
    </location>
    <part>
      <date>2025-08</date>
      <extent unit="page">
        <start>47</start>
        <end>58</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Cross-modal Clustering-based Retrieval for Scalable and Robust Image Captioning
%A You, Jingyi
%A Sasaki, Hiroshi
%A Kadowaki, Kazuma
%Y Kriz, Reno
%Y Murray, Kenton
%S Proceedings of the 1st Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2025)
%D 2025
%8 August
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-280-0
%F you-etal-2025-cross
%X Recent advances in retrieval-augmented generative image captioning (RAG-IC) have significantly improved caption quality by incorporating external knowledge and similar examples into language model-driven caption generators. However, these methods still encounter challenges when applied to real-world scenarios. First, many existing approaches rely on bimodal retrieval datastores that require large amounts of labeled data and substantial manual effort to construct, making them costly and time-consuming. Moreover, they simply retrieve the nearest samples to the input query from datastores, which leads to high redundancy in the retrieved content and subsequently degrades the quality of the generated captions. In this paper, we introduce a novel RAG-IC approach named Cross-modal Diversity-promoting Retrieval technique (CoDiRet), which integrates a text-only unimodal retrieval module with our unique cluster-based retrieval mechanism. This proposal simultaneously enhances the scalability of the datastore, promotes diversity in retrieved content, and improves robustness against out-of-domain inputs, which eventually facilitates real-world applications. Experimental results demonstrate that our method, despite being exclusively trained on the COCO benchmark dataset, achieves competitive performance on the in-domain benchmark and generalizes robustly across different domains without additional training.
%R 10.18653/v1/2025.magmar-1.4
%U https://aclanthology.org/2025.magmar-1.4/
%U https://doi.org/10.18653/v1/2025.magmar-1.4
%P 47-58