@inproceedings{guo-etal-2025-learning-describe,
title = "Learning to Describe Implicit Changes: Noise-robust Pre-training for Image Difference Captioning",
author = "Guo, Zixin and
Sun, Jiayang and
Wang, Tzu-Jui Julius and
Radman, Abduljalil and
Pehlivan, Selen and
Cao, Min and
Laaksonen, Jorma",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.537/",
doi = "10.18653/v1/2025.findings-emnlp.537",
pages = "10125--10145",
ISBN = "979-8-89176-335-7",
abstract = "Image Difference Captioning (IDC) methods have advanced in highlighting subtle differences between similar images, but their performance is often constrained by limited training data. Using Large Multimodal Models (LMMs) to describe changes in image pairs mitigates data limits but adds noise. These change descriptions are often coarse summaries, obscuring fine details and hindering noise detection. In this work, we improve IDC with a noise-robust approach at both data and model levels. We use LMMs with structured prompts to generate fine-grained change descriptions during data curation. We propose a Noise-Aware Modeling and Captioning (NAMC) model with three modules: Noise Identification and Masking (NIM) to reduce noisy correspondences, Masked Image Reconstruction (MIR) to correct over-masking errors, and Fine-grained Description Generation (FDG) to produce coherent change descriptions. Experiments on four IDC benchmarks show that NAMC, pre-trained on our large-scale data, outperforms streamlined architectures and achieves competitive performance with LLM-finetuned methods, offering better inference efficiency."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="guo-etal-2025-learning-describe">
<titleInfo>
<title>Learning to Describe Implicit Changes: Noise-robust Pre-training for Image Difference Captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zixin</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiayang</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tzu-Jui</namePart>
<namePart type="given">Julius</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abduljalil</namePart>
<namePart type="family">Radman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Selen</namePart>
<namePart type="family">Pehlivan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorma</namePart>
<namePart type="family">Laaksonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Image Difference Captioning (IDC) methods have advanced in highlighting subtle differences between similar images, but their performance is often constrained by limited training data. Using Large Multimodal Models (LMMs) to describe changes in image pairs mitigates data limits but adds noise. These change descriptions are often coarse summaries, obscuring fine details and hindering noise detection. In this work, we improve IDC with a noise-robust approach at both data and model levels. We use LMMs with structured prompts to generate fine-grained change descriptions during data curation. We propose a Noise-Aware Modeling and Captioning (NAMC) model with three modules: Noise Identification and Masking (NIM) to reduce noisy correspondences, Masked Image Reconstruction (MIR) to correct over-masking errors, and Fine-grained Description Generation (FDG) to produce coherent change descriptions. Experiments on four IDC benchmarks show that NAMC, pre-trained on our large-scale data, outperforms streamlined architectures and achieves competitive performance with LLM-finetuned methods, offering better inference efficiency.</abstract>
<identifier type="citekey">guo-etal-2025-learning-describe</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.537</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.537/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>10125</start>
<end>10145</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Learning to Describe Implicit Changes: Noise-robust Pre-training for Image Difference Captioning
%A Guo, Zixin
%A Sun, Jiayang
%A Wang, Tzu-Jui Julius
%A Radman, Abduljalil
%A Pehlivan, Selen
%A Cao, Min
%A Laaksonen, Jorma
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F guo-etal-2025-learning-describe
%X Image Difference Captioning (IDC) methods have advanced in highlighting subtle differences between similar images, but their performance is often constrained by limited training data. Using Large Multimodal Models (LMMs) to describe changes in image pairs mitigates data limits but adds noise. These change descriptions are often coarse summaries, obscuring fine details and hindering noise detection. In this work, we improve IDC with a noise-robust approach at both data and model levels. We use LMMs with structured prompts to generate fine-grained change descriptions during data curation. We propose a Noise-Aware Modeling and Captioning (NAMC) model with three modules: Noise Identification and Masking (NIM) to reduce noisy correspondences, Masked Image Reconstruction (MIR) to correct over-masking errors, and Fine-grained Description Generation (FDG) to produce coherent change descriptions. Experiments on four IDC benchmarks show that NAMC, pre-trained on our large-scale data, outperforms streamlined architectures and achieves competitive performance with LLM-finetuned methods, offering better inference efficiency.
%R 10.18653/v1/2025.findings-emnlp.537
%U https://aclanthology.org/2025.findings-emnlp.537/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.537
%P 10125-10145
Markdown (Informal)
[Learning to Describe Implicit Changes: Noise-robust Pre-training for Image Difference Captioning](https://aclanthology.org/2025.findings-emnlp.537/) (Guo et al., Findings 2025)
ACL