@inproceedings{su-etal-2025-climateviz,
title = "{C}limate{V}iz: A Benchmark for Statistical Reasoning and Fact Verification on Scientific Charts",
author = "Su, Ruiran and
Si, Jiasheng and
Guo, Zhijiang and
Pierrehumbert, Janet B.",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1196/",
pages = "23447--23469",
ISBN = "979-8-89176-332-6",
abstract = "Scientific fact-checking has largely focused on textual and tabular sources, neglecting scientific charts{---}a primary medium for conveying quantitative evidence and supporting statistical reasoning in research communication. We introduce ClimateViz, the first large-scale benchmark for scientific fact-checking grounded in real-world, expert-curated scientific charts. ClimateViz comprises 49,862 claims paired with 2,896 visualizations, each labeled as support, refute, or not enough information. To enable interpretable verification, each instance includes structured knowledge graph explanations that capture statistical patterns, temporal trends, spatial comparisons, and causal relations. We conduct a comprehensive evaluation of state-of-the-art multimodal large language models, including proprietary and open-source ones, under zero-shot and few-shot settings. Our results show that current models struggle to perform fact-checking when statistical reasoning over charts is required: even the best-performing systems, such as Gemini 2.5 and InternVL 2.5, achieve only 76.2{--}77.8{\%} accuracy in label-only output settings, which is far below human performance (89.3{\%} and 92.7{\%}). While few-shot prompting yields limited improvements, explanation-augmented outputs significantly enhance performance in some closed-source models, notably o3 and Gemini 2.5."
}