BibTeX
@inproceedings{yanuka-etal-2025-bridging,
    title = "Bridging the Visual Gap: Fine-Tuning Multimodal Models with Knowledge-Adapted Captions",
    author = "Yanuka, Moran and
      Ben-Kish, Assaf and
      Bitton, Yonatan and
      Szpektor, Idan and
      Giryes, Raja",
    editor = "Chiruzzo, Luis and
      Ritter, Alan and
      Wang, Lu",
    booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
    month = apr,
    year = "2025",
    address = "Albuquerque, New Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.naacl-long.527/",
    doi = "10.18653/v1/2025.naacl-long.527",
    pages = "10497--10518",
    ISBN = "979-8-89176-189-6",
    abstract = "Recent research increasingly focuses on training vision-language models (VLMs) with long, detailed image captions. However, small-scale VLMs often struggle to balance the richness of these captions with the risk of hallucinating content during fine-tuning. In this paper, we explore how well VLMs adapt to such captions. To quantify caption quality, we propose Decomposed NLI (DNLI), an evaluation framework that breaks down generated captions into individual propositions, assessing each in isolation. This fine-grained analysis reveals a critical balance between capturing descriptive details and preventing hallucinations. Our findings show that simply reducing caption complexity or employing standard data curation techniques does not effectively resolve this issue. To tackle this challenge, we introduce Knowledge Adapted (KnowAda) fine-tuning, a data-centric approach that automatically adapts training data with the model{'}s existing knowledge and visual understanding. KnowAda minimizes hallucinations while preserving high descriptiveness. We validate this approach across several small-scale VLMs (up to 7B parameters) and dense caption datasets, demonstrating that KnowAda effectively balances hallucination reduction and descriptiveness. Our results show that KnowAda outperforms various baselines in both automatic metrics and human evaluations."
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yanuka-etal-2025-bridging">
    <titleInfo>
        <title>Bridging the Visual Gap: Fine-Tuning Multimodal Models with Knowledge-Adapted Captions</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Moran</namePart>
        <namePart type="family">Yanuka</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Assaf</namePart>
        <namePart type="family">Ben-Kish</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Yonatan</namePart>
        <namePart type="family">Bitton</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Idan</namePart>
        <namePart type="family">Szpektor</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Raja</namePart>
        <namePart type="family">Giryes</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Luis</namePart>
            <namePart type="family">Chiruzzo</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Alan</namePart>
            <namePart type="family">Ritter</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Lu</namePart>
            <namePart type="family">Wang</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
        <identifier type="isbn">979-8-89176-189-6</identifier>
    </relatedItem>
    <abstract>Recent research increasingly focuses on training vision-language models (VLMs) with long, detailed image captions. However, small-scale VLMs often struggle to balance the richness of these captions with the risk of hallucinating content during fine-tuning. In this paper, we explore how well VLMs adapt to such captions. To quantify caption quality, we propose Decomposed NLI (DNLI), an evaluation framework that breaks down generated captions into individual propositions, assessing each in isolation. This fine-grained analysis reveals a critical balance between capturing descriptive details and preventing hallucinations. Our findings show that simply reducing caption complexity or employing standard data curation techniques does not effectively resolve this issue. To tackle this challenge, we introduce Knowledge Adapted (KnowAda) fine-tuning, a data-centric approach that automatically adapts training data with the model’s existing knowledge and visual understanding. KnowAda minimizes hallucinations while preserving high descriptiveness. We validate this approach across several small-scale VLMs (up to 7B parameters) and dense caption datasets, demonstrating that KnowAda effectively balances hallucination reduction and descriptiveness. Our results show that KnowAda outperforms various baselines in both automatic metrics and human evaluations.</abstract>
    <identifier type="citekey">yanuka-etal-2025-bridging</identifier>
    <identifier type="doi">10.18653/v1/2025.naacl-long.527</identifier>
    <location>
        <url>https://aclanthology.org/2025.naacl-long.527/</url>
    </location>
    <part>
        <date>2025-04</date>
        <extent unit="page">
            <start>10497</start>
            <end>10518</end>
        </extent>
    </part>
</mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T Bridging the Visual Gap: Fine-Tuning Multimodal Models with Knowledge-Adapted Captions
%A Yanuka, Moran
%A Ben-Kish, Assaf
%A Bitton, Yonatan
%A Szpektor, Idan
%A Giryes, Raja
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F yanuka-etal-2025-bridging
%X Recent research increasingly focuses on training vision-language models (VLMs) with long, detailed image captions. However, small-scale VLMs often struggle to balance the richness of these captions with the risk of hallucinating content during fine-tuning. In this paper, we explore how well VLMs adapt to such captions. To quantify caption quality, we propose Decomposed NLI (DNLI), an evaluation framework that breaks down generated captions into individual propositions, assessing each in isolation. This fine-grained analysis reveals a critical balance between capturing descriptive details and preventing hallucinations. Our findings show that simply reducing caption complexity or employing standard data curation techniques does not effectively resolve this issue. To tackle this challenge, we introduce Knowledge Adapted (KnowAda) fine-tuning, a data-centric approach that automatically adapts training data with the model’s existing knowledge and visual understanding. KnowAda minimizes hallucinations while preserving high descriptiveness. We validate this approach across several small-scale VLMs (up to 7B parameters) and dense caption datasets, demonstrating that KnowAda effectively balances hallucination reduction and descriptiveness. Our results show that KnowAda outperforms various baselines in both automatic metrics and human evaluations.
%R 10.18653/v1/2025.naacl-long.527
%U https://aclanthology.org/2025.naacl-long.527/
%U https://doi.org/10.18653/v1/2025.naacl-long.527
%P 10497-10518

Markdown (Informal)
[Bridging the Visual Gap: Fine-Tuning Multimodal Models with Knowledge-Adapted Captions](https://aclanthology.org/2025.naacl-long.527/) (Yanuka et al., NAACL 2025)

ACL
Moran Yanuka, Assaf Ben-Kish, Yonatan Bitton, Idan Szpektor, and Raja Giryes. 2025. Bridging the Visual Gap: Fine-Tuning Multimodal Models with Knowledge-Adapted Captions. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 10497–10518, Albuquerque, New Mexico. Association for Computational Linguistics.
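As a rough illustration of the DNLI evaluation described in the abstract, the sketch below decomposes a generated caption into propositions and labels each one in isolation against a reference description using an off-the-shelf MNLI classifier. Everything here is an assumption for illustration only: the naive sentence-split `decompose` stand-in, the choice of `roberta-large-mnli` as the judge, and the premise/hypothesis pairing are not taken from the paper, whose actual decomposition and judging pipeline may differ.

```python
# Illustrative sketch only: approximates the DNLI idea from the abstract
# (split a caption into propositions, judge each one in isolation).
# Model choice and decomposition are assumptions, not the paper's method.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

MODEL_NAME = "roberta-large-mnli"  # assumed off-the-shelf NLI judge
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()

def decompose(caption: str) -> list[str]:
    # Naive sentence-level stand-in for proposition extraction; a real
    # decomposition would break sentences into atomic claims.
    return [s.strip() for s in caption.split(".") if s.strip()]

def dnli(reference: str, caption: str) -> list[tuple[str, str]]:
    """Label each proposition of `caption` against `reference`."""
    results = []
    for prop in decompose(caption):
        enc = tokenizer(reference, prop, return_tensors="pt", truncation=True)
        with torch.no_grad():
            pred = model(**enc).logits.argmax(dim=-1).item()
        # roberta-large-mnli labels: 0=CONTRADICTION, 1=NEUTRAL, 2=ENTAILMENT
        results.append((prop, model.config.id2label[pred]))
    return results
```

Under this setup, `dnli(reference, caption)` would return ENTAILMENT for propositions supported by the reference, while a fabricated detail should come back NEUTRAL or CONTRADICTION, giving the kind of per-proposition breakdown the abstract attributes to DNLI.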