BibTeX
@inproceedings{mukhopadhyay-etal-2024-unraveling,
    title = "Unraveling the Truth: Do {VLM}s really Understand Charts? A Deep Dive into Consistency and Robustness",
    author = "Mukhopadhyay, Srija and
      Qidwai, Adnan and
      Garimella, Aparna and
      Ramu, Pritika and
      Gupta, Vivek and
      Roth, Dan",
    editor = "Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.findings-emnlp.973",
    doi = "10.18653/v1/2024.findings-emnlp.973",
    pages = "16696--16717",
    abstract = "Chart question answering (CQA) is a crucial area of Visual Language Understanding. However, the robustness and consistency of current Visual Language Models (VLMs) in this field remain under-explored. This paper evaluates state-of-the-art VLMs on comprehensive datasets, developed specifically for this study, encompassing diverse question categories and chart formats. We investigate two key aspects: 1) the models{'} ability to handle varying levels of chart and question complexity, and 2) their robustness across different visual representations of the same underlying data. Our analysis reveals significant performance variations based on question and chart types, highlighting both strengths and weaknesses of current models. Additionally, we identify areas for improvement and propose future research directions to build more robust and reliable CQA systems. This study sheds light on the limitations of current models and paves the way for future advancements in the field.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="mukhopadhyay-etal-2024-unraveling">
    <titleInfo>
      <title>Unraveling the Truth: Do VLMs really Understand Charts? A Deep Dive into Consistency and Robustness</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Srija</namePart>
      <namePart type="family">Mukhopadhyay</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Adnan</namePart>
      <namePart type="family">Qidwai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aparna</namePart>
      <namePart type="family">Garimella</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pritika</namePart>
      <namePart type="family">Ramu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vivek</namePart>
      <namePart type="family">Gupta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dan</namePart>
      <namePart type="family">Roth</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Chart question answering (CQA) is a crucial area of Visual Language Understanding. However, the robustness and consistency of current Visual Language Models (VLMs) in this field remain under-explored. This paper evaluates state-of-the-art VLMs on comprehensive datasets, developed specifically for this study, encompassing diverse question categories and chart formats. We investigate two key aspects: 1) the models’ ability to handle varying levels of chart and question complexity, and 2) their robustness across different visual representations of the same underlying data. Our analysis reveals significant performance variations based on question and chart types, highlighting both strengths and weaknesses of current models. Additionally, we identify areas for improvement and propose future research directions to build more robust and reliable CQA systems. This study sheds light on the limitations of current models and paves the way for future advancements in the field.</abstract>
    <identifier type="citekey">mukhopadhyay-etal-2024-unraveling</identifier>
    <identifier type="doi">10.18653/v1/2024.findings-emnlp.973</identifier>
    <location>
      <url>https://aclanthology.org/2024.findings-emnlp.973</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>16696</start>
        <end>16717</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Unraveling the Truth: Do VLMs really Understand Charts? A Deep Dive into Consistency and Robustness
%A Mukhopadhyay, Srija
%A Qidwai, Adnan
%A Garimella, Aparna
%A Ramu, Pritika
%A Gupta, Vivek
%A Roth, Dan
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F mukhopadhyay-etal-2024-unraveling
%X Chart question answering (CQA) is a crucial area of Visual Language Understanding. However, the robustness and consistency of current Visual Language Models (VLMs) in this field remain under-explored. This paper evaluates state-of-the-art VLMs on comprehensive datasets, developed specifically for this study, encompassing diverse question categories and chart formats. We investigate two key aspects: 1) the models’ ability to handle varying levels of chart and question complexity, and 2) their robustness across different visual representations of the same underlying data. Our analysis reveals significant performance variations based on question and chart types, highlighting both strengths and weaknesses of current models. Additionally, we identify areas for improvement and propose future research directions to build more robust and reliable CQA systems. This study sheds light on the limitations of current models and paves the way for future advancements in the field.
%R 10.18653/v1/2024.findings-emnlp.973
%U https://aclanthology.org/2024.findings-emnlp.973
%U https://doi.org/10.18653/v1/2024.findings-emnlp.973
%P 16696-16717
Markdown (Informal)
[Unraveling the Truth: Do VLMs really Understand Charts? A Deep Dive into Consistency and Robustness](https://aclanthology.org/2024.findings-emnlp.973) (Mukhopadhyay et al., Findings 2024)
ACL
Srija Mukhopadhyay, Adnan Qidwai, Aparna Garimella, Pritika Ramu, Vivek Gupta, and Dan Roth. 2024. Unraveling the Truth: Do VLMs really Understand Charts? A Deep Dive into Consistency and Robustness. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 16696–16717, Miami, Florida, USA. Association for Computational Linguistics.