@inproceedings{geigle-etal-2024-african,
title = "{A}frican or {E}uropean Swallow? Benchmarking Large Vision-Language Models for Fine-Grained Object Classification",
author = "Geigle, Gregor and
Timofte, Radu and
Glava{\v{s}}, Goran",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.154/",
doi = "10.18653/v1/2024.emnlp-main.154",
pages = "2653--2669",
abstract = "Recent Large Vision-Language Models (LVLMs) demonstrate impressive abilities on numerous image understanding and reasoning tasks. The task of fine-grained object classification (e.g., distinction between \textit{animal species}), however, has been probed insufficiently, despite its downstream importance. We fill this evaluation gap by creating FOCI (\textbf{F}ine-grained \textbf{O}bject \textbf{C}lass\textbf{I}fication), a difficult multiple-choice benchmark for fine-grained object classification, from existing object classification datasets: (1) multiple-choice avoids ambiguous answers associated with casting classification as open-ended QA task; (2) we retain classification difficulty by mining negative labels with a CLIP model. FOCI complements five popular classification datasets with four domain-specific subsets from ImageNet-21k. We benchmark 12 public LVLMs on and show that it tests for a \textit{complementary skill} to established image understanding and reasoning benchmarks. Crucially, CLIP models exhibit dramatically better performance than LVLMs. Since the image encoders of LVLMs come from these CLIP models, this points to inadequate alignment for fine-grained object distinction between the encoder and the LLM and warrants (pre)training data with more fine-grained annotation. We release our code at \url{ANONYMIZED}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="geigle-etal-2024-african">
<titleInfo>
<title>African or European Swallow? Benchmarking Large Vision-Language Models for Fine-Grained Object Classification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gregor</namePart>
<namePart type="family">Geigle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Radu</namePart>
<namePart type="family">Timofte</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Goran</namePart>
<namePart type="family">Glavaš</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent Large Vision-Language Models (LVLMs) demonstrate impressive abilities on numerous image understanding and reasoning tasks. The task of fine-grained object classification (e.g., distinction between animal species), however, has been probed insufficiently, despite its downstream importance. We fill this evaluation gap by creating FOCI (Fine-grained Object ClassIfication), a difficult multiple-choice benchmark for fine-grained object classification, from existing object classification datasets: (1) multiple-choice avoids ambiguous answers associated with casting classification as open-ended QA task; (2) we retain classification difficulty by mining negative labels with a CLIP model. FOCI complements five popular classification datasets with four domain-specific subsets from ImageNet-21k. We benchmark 12 public LVLMs on and show that it tests for a complementary skill to established image understanding and reasoning benchmarks. Crucially, CLIP models exhibit dramatically better performance than LVLMs. Since the image encoders of LVLMs come from these CLIP models, this points to inadequate alignment for fine-grained object distinction between the encoder and the LLM and warrants (pre)training data with more fine-grained annotation. We release our code at ANONYMIZED.</abstract>
<identifier type="citekey">geigle-etal-2024-african</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.154</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.154/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>2653</start>
<end>2669</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T African or European Swallow? Benchmarking Large Vision-Language Models for Fine-Grained Object Classification
%A Geigle, Gregor
%A Timofte, Radu
%A Glavaš, Goran
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F geigle-etal-2024-african
%X Recent Large Vision-Language Models (LVLMs) demonstrate impressive abilities on numerous image understanding and reasoning tasks. The task of fine-grained object classification (e.g., distinction between animal species), however, has been probed insufficiently, despite its downstream importance. We fill this evaluation gap by creating FOCI (Fine-grained Object ClassIfication), a difficult multiple-choice benchmark for fine-grained object classification, from existing object classification datasets: (1) multiple-choice avoids ambiguous answers associated with casting classification as open-ended QA task; (2) we retain classification difficulty by mining negative labels with a CLIP model. FOCI complements five popular classification datasets with four domain-specific subsets from ImageNet-21k. We benchmark 12 public LVLMs on and show that it tests for a complementary skill to established image understanding and reasoning benchmarks. Crucially, CLIP models exhibit dramatically better performance than LVLMs. Since the image encoders of LVLMs come from these CLIP models, this points to inadequate alignment for fine-grained object distinction between the encoder and the LLM and warrants (pre)training data with more fine-grained annotation. We release our code at ANONYMIZED.
%R 10.18653/v1/2024.emnlp-main.154
%U https://aclanthology.org/2024.emnlp-main.154/
%U https://doi.org/10.18653/v1/2024.emnlp-main.154
%P 2653-2669
Markdown (Informal)
[African or European Swallow? Benchmarking Large Vision-Language Models for Fine-Grained Object Classification](https://aclanthology.org/2024.emnlp-main.154/) (Geigle et al., EMNLP 2024)
ACL