@inproceedings{huang-etal-2025-image,
title = "Image Difference Captioning via Adversarial Preference Optimization",
author = "Huang, Zihan and
Wu, Junda and
Surana, Rohan and
Yu, Tong and
Arbour, David and
Sinha, Ritwik and
McAuley, Julian",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1713/",
doi = "10.18653/v1/2025.emnlp-main.1713",
pages = "33758--33770",
ISBN = "979-8-89176-332-6",
abstract = "Image Difference Captioning (IDC) aims to generate natural language descriptions that highlight subtle differences between two visually similar images. While recent advances leverage pre-trained vision-language models to align fine-grained visual differences with textual semantics, existing supervised approaches often overfit to dataset-specific language patterns and fail to capture accurate preferences on IDC, which often indicates fine-grained and context-aware distinctions. To address these limitations, we propose an adversarial direct preference optimization (ADPO) framework for IDC, which formulates IDC as a preference optimization problem under the Bradley-Terry-Luce model, directly aligning the captioning policy with pairwise difference preferences via Direct Preference Optimization (DPO). To model more accurate and diverse IDC preferences, we introduce an adversarially trained hard negative retriever that selects counterfactual captions, This results in a minimax optimization problem, which we solve via policy-gradient reinforcement learning, enabling the policy and retriever to improve jointly. Experiments on benchmark IDC datasets show that our approach outperforms existing baselines, especially in generating fine-grained and accurate difference descriptions."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="huang-etal-2025-image">
<titleInfo>
<title>Image Difference Captioning via Adversarial Preference Optimization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zihan</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junda</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rohan</namePart>
<namePart type="family">Surana</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Arbour</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ritwik</namePart>
<namePart type="family">Sinha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julian</namePart>
<namePart type="family">McAuley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Image Difference Captioning (IDC) aims to generate natural language descriptions that highlight subtle differences between two visually similar images. While recent advances leverage pre-trained vision-language models to align fine-grained visual differences with textual semantics, existing supervised approaches often overfit to dataset-specific language patterns and fail to capture accurate preferences on IDC, which often indicates fine-grained and context-aware distinctions. To address these limitations, we propose an adversarial direct preference optimization (ADPO) framework for IDC, which formulates IDC as a preference optimization problem under the Bradley-Terry-Luce model, directly aligning the captioning policy with pairwise difference preferences via Direct Preference Optimization (DPO). To model more accurate and diverse IDC preferences, we introduce an adversarially trained hard negative retriever that selects counterfactual captions, This results in a minimax optimization problem, which we solve via policy-gradient reinforcement learning, enabling the policy and retriever to improve jointly. Experiments on benchmark IDC datasets show that our approach outperforms existing baselines, especially in generating fine-grained and accurate difference descriptions.</abstract>
<identifier type="citekey">huang-etal-2025-image</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.1713</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1713/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>33758</start>
<end>33770</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Image Difference Captioning via Adversarial Preference Optimization
%A Huang, Zihan
%A Wu, Junda
%A Surana, Rohan
%A Yu, Tong
%A Arbour, David
%A Sinha, Ritwik
%A McAuley, Julian
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F huang-etal-2025-image
%X Image Difference Captioning (IDC) aims to generate natural language descriptions that highlight subtle differences between two visually similar images. While recent advances leverage pre-trained vision-language models to align fine-grained visual differences with textual semantics, existing supervised approaches often overfit to dataset-specific language patterns and fail to capture accurate preferences on IDC, which often indicates fine-grained and context-aware distinctions. To address these limitations, we propose an adversarial direct preference optimization (ADPO) framework for IDC, which formulates IDC as a preference optimization problem under the Bradley-Terry-Luce model, directly aligning the captioning policy with pairwise difference preferences via Direct Preference Optimization (DPO). To model more accurate and diverse IDC preferences, we introduce an adversarially trained hard negative retriever that selects counterfactual captions, This results in a minimax optimization problem, which we solve via policy-gradient reinforcement learning, enabling the policy and retriever to improve jointly. Experiments on benchmark IDC datasets show that our approach outperforms existing baselines, especially in generating fine-grained and accurate difference descriptions.
%R 10.18653/v1/2025.emnlp-main.1713
%U https://aclanthology.org/2025.emnlp-main.1713/
%U https://doi.org/10.18653/v1/2025.emnlp-main.1713
%P 33758-33770
Markdown (Informal)
[Image Difference Captioning via Adversarial Preference Optimization](https://aclanthology.org/2025.emnlp-main.1713/) (Huang et al., EMNLP 2025)
ACL
- Zihan Huang, Junda Wu, Rohan Surana, Tong Yu, David Arbour, Ritwik Sinha, and Julian McAuley. 2025. Image Difference Captioning via Adversarial Preference Optimization. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 33758–33770, Suzhou, China. Association for Computational Linguistics.