@inproceedings{zhao-etal-2024-difficult,
title = "Difficult Task Yes but Simple Task No: Unveiling the Laziness in Multimodal {LLM}s",
author = "Zhao, Sihang and
Yuan, Youliang and
Tang, Xiaoying and
He, Pinjia",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.442/",
doi = "10.18653/v1/2024.findings-emnlp.442",
pages = "7535--7548",
abstract = "Multimodal Large Language Models (MLLMs) demonstrate a strong understanding of the real world and can even handle complex tasks. However, they still fail on some straightforward visual question-answering (VQA) problems. This paper dives deeper into this issue, revealing that models tend to err when answering easy questions (e.g., Yes/No questions) about an image, even though they can correctly describe it.We refer to this model behavior discrepancy between difficult and simple questions as model laziness.To systematically investigate model laziness, we manually construct LazyBench, a benchmark that includes Yes/No, multiple choice, short answer questions, and image description tasks that are related to the same subjects in the images.Based on LazyBench. we observe that laziness widely exists in current advanced MLLMs (e.g., GPT-4o, Gemini-1.5-pro, Claude 3, LLaVA-1.5, LLaVA-1.6, and QWen-VL). We also analyzed the failure cases of LLaVA-1.5-13B on the VQA-v2 benchmark and discovered that about half of these failures are due to the model`s laziness. This further highlights the importance of ensuring that the model fully utilizes its capability.To this end, we conduct a preliminary exploration of how to mitigate laziness and find that chain of thought can effectively avoid this issue. The data can be accessed at https://github.com/Akutagawa1998/LazyBench."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhao-etal-2024-difficult">
<titleInfo>
<title>Difficult Task Yes but Simple Task No: Unveiling the Laziness in Multimodal LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sihang</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Youliang</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaoying</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pinjia</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multimodal Large Language Models (MLLMs) demonstrate a strong understanding of the real world and can even handle complex tasks. However, they still fail on some straightforward visual question-answering (VQA) problems. This paper dives deeper into this issue, revealing that models tend to err when answering easy questions (e.g., Yes/No questions) about an image, even though they can correctly describe it.We refer to this model behavior discrepancy between difficult and simple questions as model laziness.To systematically investigate model laziness, we manually construct LazyBench, a benchmark that includes Yes/No, multiple choice, short answer questions, and image description tasks that are related to the same subjects in the images.Based on LazyBench. we observe that laziness widely exists in current advanced MLLMs (e.g., GPT-4o, Gemini-1.5-pro, Claude 3, LLaVA-1.5, LLaVA-1.6, and QWen-VL). We also analyzed the failure cases of LLaVA-1.5-13B on the VQA-v2 benchmark and discovered that about half of these failures are due to the model‘s laziness. This further highlights the importance of ensuring that the model fully utilizes its capability.To this end, we conduct a preliminary exploration of how to mitigate laziness and find that chain of thought can effectively avoid this issue. The data can be accessed at https://github.com/Akutagawa1998/LazyBench.</abstract>
<identifier type="citekey">zhao-etal-2024-difficult</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.442</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.442/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>7535</start>
<end>7548</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Difficult Task Yes but Simple Task No: Unveiling the Laziness in Multimodal LLMs
%A Zhao, Sihang
%A Yuan, Youliang
%A Tang, Xiaoying
%A He, Pinjia
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F zhao-etal-2024-difficult
%X Multimodal Large Language Models (MLLMs) demonstrate a strong understanding of the real world and can even handle complex tasks. However, they still fail on some straightforward visual question-answering (VQA) problems. This paper dives deeper into this issue, revealing that models tend to err when answering easy questions (e.g., Yes/No questions) about an image, even though they can correctly describe it.We refer to this model behavior discrepancy between difficult and simple questions as model laziness.To systematically investigate model laziness, we manually construct LazyBench, a benchmark that includes Yes/No, multiple choice, short answer questions, and image description tasks that are related to the same subjects in the images.Based on LazyBench. we observe that laziness widely exists in current advanced MLLMs (e.g., GPT-4o, Gemini-1.5-pro, Claude 3, LLaVA-1.5, LLaVA-1.6, and QWen-VL). We also analyzed the failure cases of LLaVA-1.5-13B on the VQA-v2 benchmark and discovered that about half of these failures are due to the model‘s laziness. This further highlights the importance of ensuring that the model fully utilizes its capability.To this end, we conduct a preliminary exploration of how to mitigate laziness and find that chain of thought can effectively avoid this issue. The data can be accessed at https://github.com/Akutagawa1998/LazyBench.
%R 10.18653/v1/2024.findings-emnlp.442
%U https://aclanthology.org/2024.findings-emnlp.442/
%U https://doi.org/10.18653/v1/2024.findings-emnlp.442
%P 7535-7548
Markdown (Informal)
[Difficult Task Yes but Simple Task No: Unveiling the Laziness in Multimodal LLMs](https://aclanthology.org/2024.findings-emnlp.442/) (Zhao et al., Findings 2024)
ACL