% ACL Anthology BibTeX record for the RONA paper (In2Writing 2025 workshop proceedings).
% NOTE(review): the first author's family name is entered as "Anantha Ramakrishnan",
% while the second author is entered as family "Ramakrishnan", given "Aadarsh Anantha".
% Confirm with the authors whether this asymmetry is intended.
@inproceedings{anantha-ramakrishnan-etal-2025-rona,
  title     = {{RONA}: Pragmatically Diverse Image Captioning with Coherence Relations},
  author    = {Anantha Ramakrishnan, Aashish and
               Ramakrishnan, Aadarsh Anantha and
               Lee, Dongwon},
  editor    = {Padmakumar, Vishakh and
               Gero, Katy and
               Wambsganss, Thiemo and
               Sterman, Sarah and
               Huang, Ting-Hao and
               Zhou, David and
               Chung, John},
  booktitle = {Proceedings of the Fourth Workshop on Intelligent and Interactive Writing Assistants ({In2Writing} 2025)},
  month     = may,
  year      = {2025},
  address   = {Albuquerque, New Mexico, US},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.in2writing-1.8/},
  doi       = {10.18653/v1/2025.in2writing-1.8},
  pages     = {74--86},
  isbn      = {979-8-89176-239-8},
  abstract  = {Writing Assistants (e.g., Grammarly, Microsoft Copilot) traditionally generate diverse image captions by employing syntactic and semantic variations to describe image components. However, human-written captions prioritize conveying a central message alongside visual descriptions using pragmatic cues. To enhance caption diversity, it is essential to explore alternative ways of communicating these messages in conjunction with visual content. We propose RONA, a novel prompting strategy for Multi-modal Large Language Models (MLLM) that leverages Coherence Relations as a controllable axis for pragmatic variations. We demonstrate that RONA generates captions with better overall diversity and ground-truth alignment, compared to MLLM baselines across multiple domains. Our code is available at: https://github.com/aashish2000/RONA},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; the ID attribute matches the citekey identifier further down. -->
  <mods ID="anantha-ramakrishnan-etal-2025-rona">
    <titleInfo>
      <title>RONA: Pragmatically Diverse Image Captioning with Coherence Relations</title>
    </titleInfo>

    <!-- Paper authors. -->
    <name type="personal">
      <namePart type="given">Aashish</namePart>
      <namePart type="family">Anantha Ramakrishnan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aadarsh</namePart>
      <namePart type="given">Anantha</namePart>
      <namePart type="family">Ramakrishnan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dongwon</namePart>
      <namePart type="family">Lee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fourth Workshop on Intelligent and Interactive Writing Assistants (In2Writing 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Vishakh</namePart>
        <namePart type="family">Padmakumar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Katy</namePart>
        <namePart type="family">Gero</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Thiemo</namePart>
        <namePart type="family">Wambsganss</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sarah</namePart>
        <namePart type="family">Sterman</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ting-Hao</namePart>
        <namePart type="family">Huang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Zhou</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">John</namePart>
        <namePart type="family">Chung</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico, US</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-239-8</identifier>
    </relatedItem>

    <abstract>Writing Assistants (e.g., Grammarly, Microsoft Copilot) traditionally generate diverse image captions by employing syntactic and semantic variations to describe image components. However, human-written captions prioritize conveying a central message alongside visual descriptions using pragmatic cues. To enhance caption diversity, it is essential to explore alternative ways of communicating these messages in conjunction with visual content. We propose RONA, a novel prompting strategy for Multi-modal Large Language Models (MLLM) that leverages Coherence Relations as a controllable axis for pragmatic variations. We demonstrate that RONA generates captions with better overall diversity and ground-truth alignment, compared to MLLM baselines across multiple domains. Our code is available at: https://github.com/aashish2000/RONA</abstract>

    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">anantha-ramakrishnan-etal-2025-rona</identifier>
    <identifier type="doi">10.18653/v1/2025.in2writing-1.8</identifier>
    <location>
      <url>https://aclanthology.org/2025.in2writing-1.8/</url>
    </location>

    <!-- Date and page extent of the paper. -->
    <part>
      <date>2025-05</date>
      <extent unit="page">
        <start>74</start>
        <end>86</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T RONA: Pragmatically Diverse Image Captioning with Coherence Relations
%A Anantha Ramakrishnan, Aashish
%A Ramakrishnan, Aadarsh Anantha
%A Lee, Dongwon
%Y Padmakumar, Vishakh
%Y Gero, Katy
%Y Wambsganss, Thiemo
%Y Sterman, Sarah
%Y Huang, Ting-Hao
%Y Zhou, David
%Y Chung, John
%S Proceedings of the Fourth Workshop on Intelligent and Interactive Writing Assistants (In2Writing 2025)
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico, US
%@ 979-8-89176-239-8
%F anantha-ramakrishnan-etal-2025-rona
%X Writing Assistants (e.g., Grammarly, Microsoft Copilot) traditionally generate diverse image captions by employing syntactic and semantic variations to describe image components. However, human-written captions prioritize conveying a central message alongside visual descriptions using pragmatic cues. To enhance caption diversity, it is essential to explore alternative ways of communicating these messages in conjunction with visual content. We propose RONA, a novel prompting strategy for Multi-modal Large Language Models (MLLM) that leverages Coherence Relations as a controllable axis for pragmatic variations. We demonstrate that RONA generates captions with better overall diversity and ground-truth alignment, compared to MLLM baselines across multiple domains. Our code is available at: https://github.com/aashish2000/RONA
%R 10.18653/v1/2025.in2writing-1.8
%U https://aclanthology.org/2025.in2writing-1.8/
%U https://doi.org/10.18653/v1/2025.in2writing-1.8
%P 74-86
Markdown (Informal)
[RONA: Pragmatically Diverse Image Captioning with Coherence Relations](https://aclanthology.org/2025.in2writing-1.8/) (Anantha Ramakrishnan et al., In2Writing 2025)
ACL
- Aashish Anantha Ramakrishnan, Aadarsh Anantha Ramakrishnan, and Dongwon Lee. 2025. RONA: Pragmatically Diverse Image Captioning with Coherence Relations. In Proceedings of the Fourth Workshop on Intelligent and Interactive Writing Assistants (In2Writing 2025), pages 74–86, Albuquerque, New Mexico, US. Association for Computational Linguistics.