@inproceedings{kim-ji-2024-finer,
title = "Finer: Investigating and Enhancing Fine-Grained Visual Concept Recognition in Large Vision Language Models",
author = "Kim, Jeonghwan and
Ji, Heng",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.356",
doi = "10.18653/v1/2024.emnlp-main.356",
pages = "6187--6207",
abstract = "Recent advances in instruction-tuned Large Vision-Language Models (LVLMs) have imbued the models with the ability to generate high-level, image-grounded explanations with ease. While such capability is largely attributed to the rich world knowledge contained within the Large Language Models (LLMs), our work reveals their shortcomings in fine-grained visual categorization (FGVC) across six different benchmark settings. Most recent state-of-the-art LVLMs such as LLaVa-1.5, InstructBLIP and GPT-4V not only severely deteriorate in terms of classification performance, e.g., average drop of 65.58 in EM for Stanford Dogs for LLaVA-1.5, but also struggle to generate descriptive visual attributes based on a concept that appears within an input image despite their prominent zero-shot image captioning ability. In-depth analyses show that instruction-tuned LVLMs suffer from modality gap, showing discrepancy when given textual and visual inputs that correspond to the same concept. In an effort to further the community{'}s endeavor in this direction, we propose a multiple granularity attribute-centric benchmark and training mixture, Finer, which aims to establish a ground to evaluate LVLMs{'} fine-grained visual comprehension ability and provide significantly improved explainability.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kim-ji-2024-finer">
    <titleInfo>
      <title>Finer: Investigating and Enhancing Fine-Grained Visual Concept Recognition in Large Vision Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jeonghwan</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Heng</namePart>
      <namePart type="family">Ji</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Recent advances in instruction-tuned Large Vision-Language Models (LVLMs) have imbued the models with the ability to generate high-level, image-grounded explanations with ease. While such capability is largely attributed to the rich world knowledge contained within the Large Language Models (LLMs), our work reveals their shortcomings in fine-grained visual categorization (FGVC) across six different benchmark settings. Most recent state-of-the-art LVLMs such as LLaVa-1.5, InstructBLIP and GPT-4V not only severely deteriorate in terms of classification performance, e.g., average drop of 65.58 in EM for Stanford Dogs for LLaVA-1.5, but also struggle to generate descriptive visual attributes based on a concept that appears within an input image despite their prominent zero-shot image captioning ability. In-depth analyses show that instruction-tuned LVLMs suffer from modality gap, showing discrepancy when given textual and visual inputs that correspond to the same concept. In an effort to further the community’s endeavor in this direction, we propose a multiple granularity attribute-centric benchmark and training mixture, Finer, which aims to establish a ground to evaluate LVLMs’ fine-grained visual comprehension ability and provide significantly improved explainability.</abstract>
    <identifier type="citekey">kim-ji-2024-finer</identifier>
    <identifier type="doi">10.18653/v1/2024.emnlp-main.356</identifier>
    <location>
      <url>https://aclanthology.org/2024.emnlp-main.356</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>6187</start>
        <end>6207</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Finer: Investigating and Enhancing Fine-Grained Visual Concept Recognition in Large Vision Language Models
%A Kim, Jeonghwan
%A Ji, Heng
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F kim-ji-2024-finer
%X Recent advances in instruction-tuned Large Vision-Language Models (LVLMs) have imbued the models with the ability to generate high-level, image-grounded explanations with ease. While such capability is largely attributed to the rich world knowledge contained within the Large Language Models (LLMs), our work reveals their shortcomings in fine-grained visual categorization (FGVC) across six different benchmark settings. Most recent state-of-the-art LVLMs such as LLaVa-1.5, InstructBLIP and GPT-4V not only severely deteriorate in terms of classification performance, e.g., average drop of 65.58 in EM for Stanford Dogs for LLaVA-1.5, but also struggle to generate descriptive visual attributes based on a concept that appears within an input image despite their prominent zero-shot image captioning ability. In-depth analyses show that instruction-tuned LVLMs suffer from modality gap, showing discrepancy when given textual and visual inputs that correspond to the same concept. In an effort to further the community’s endeavor in this direction, we propose a multiple granularity attribute-centric benchmark and training mixture, Finer, which aims to establish a ground to evaluate LVLMs’ fine-grained visual comprehension ability and provide significantly improved explainability.
%R 10.18653/v1/2024.emnlp-main.356
%U https://aclanthology.org/2024.emnlp-main.356
%U https://doi.org/10.18653/v1/2024.emnlp-main.356
%P 6187-6207
Markdown (Informal)
[Finer: Investigating and Enhancing Fine-Grained Visual Concept Recognition in Large Vision Language Models](https://aclanthology.org/2024.emnlp-main.356) (Kim & Ji, EMNLP 2024)
ACL
Jeonghwan Kim and Heng Ji. 2024. Finer: Investigating and Enhancing Fine-Grained Visual Concept Recognition in Large Vision Language Models. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 6187–6207, Miami, Florida, USA. Association for Computational Linguistics.