@inproceedings{manevich-tsarfaty-2025-making,
title = "Making {LVLM}s Look Twice: Contrastive Decoding with Contrast Images",
author = "Manevich, Avshalom and
Tsarfaty, Reut",
editor = "Kriz, Reno and
Murray, Kenton",
booktitle = "Proceedings of the 1st Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2025)",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.magmar-1.6/",
doi = "10.18653/v1/2025.magmar-1.6",
pages = "65--78",
ISBN = "979-8-89176-280-0",
abstract = "Large Vision-Language Models (LVLMs) are becoming increasingly popular for text-vision tasks requiring cross-modal reasoning, but often struggle with fine-grained visual discrimination. This limitation is evident in recent benchmarks like NaturalBench and D3, where closed models such as GPT-4o achieve only 39.6{\%}, and open-source models perform below random chance (25{\%}). We introduce Contrastive decoding with Contrast Images (CoCI), which adjusts LVLM outputs by contrasting them against outputs for similar images (Contrast Images - CIs). CoCI demonstrates strong performance across three distinct supervision regimes. First, when using naturally occurring CIs in benchmarks with curated image pairs, we achieve improvements of up to 98.9{\%} on NaturalBench, 69.5{\%} on D3, and 37.6{\%} on MMVP. Second, for scenarios with modest training data ({\textasciitilde}5k samples), we show that a lightweight neural classifier can effectively select CIs from similar images at inference time, improving NaturalBench performance by up to 36.8{\%}. Third, for scenarios with no training data, we develop a caption-matching technique that selects CIs by comparing LVLM-generated descriptions of candidate images. Notably, on VQAv2, our method improves VQA performance even in pointwise evaluation settings without explicit contrast images. Our approach demonstrates the potential for enhancing LVLMs at inference time through different CI selection approaches, each suited to different data availability scenarios."
}