% ACL Anthology record 2026.acl-long.1164 (conference paper, ACL 2026 long papers).
% NOTE(review): the authors "Ymyang" and "Gaojinpeng" are single-token names in this
% record, so BibTeX treats each as a bare last name -- TODO confirm the real
% given/family split against the paper and rewrite them as "Family, Given".
% The abstract is stored as plain text (no \textbf markup); presentation is left to the style.
@inproceedings{ymyang-etal-2026-chart,
    title = "Chart-{MRAG}: Benchmarking Multimodal Retrieval Augmented Generation on Chart-based Documents",
    author = "Ymyang and
      Zhong, Jiang and
      Jin, Li and
      Sun, Xiao and
      Huang, Jingwang and
      Gaojinpeng and
      Liu, Qing and
      Bai, Yang and
      Zhang, Jingyuan and
      Jiang, Rui and
      Lei, Qin and
      Wei, Kaiwen",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.acl-long.1164/",
    pages = "25392--25445",
    isbn = "979-8-89176-390-6",
    abstract = "Multimodal Retrieval-Augmented Generation (MRAG) enhances reasoning capabilities by integrating external knowledge. However, existing benchmarks primarily focus on simple image-text interactions, overlooking complex visual formats like charts that are prevalent in real-world applications. In this work, we introduce a novel task, Chart-based MRAG, to address this limitation. To generate high-quality evaluation samples, we propose CHARGE (CHARt-based document question-answering GEneration), a semi-automatic framework for generating evaluation samples through multi-modal keypoint extraction, knowledge graph construction, and QA pair synthesis. By combining CHARGE with expert validation, we construct Chart-MRAG Bench, a comprehensive benchmark for chart-based MRAG evaluation, featuring 4,738 question-answering pairs across 8 domains from real-world documents. Our experiments reveal three critical limitations in current approaches: (1) unified multimodal embedding retrieval methods struggle in chart-based scenarios, (2) even with ground-truth retrieval, state-of-the-art Multimodal Large Language Models (MLLMs) achieve only 71.15{\%} Correctness and 80.74{\%} Coverage scores, and (3) widely used MLLMs demonstrate consistent text-over-visual modality bias. These findings highlight great challenges in processing information-dense visual formats. We will make our code and dataset publicly available."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for citekey ymyang-etal-2026-chart: one conference paper plus its host proceedings. -->
  <mods ID="ymyang-etal-2026-chart">
    <titleInfo>
      <title>Chart-MRAG: Benchmarking Multimodal Retrieval Augmented Generation on Chart-based Documents</title>
    </titleInfo>
    <!-- Authors, in byline order. -->
    <!-- Single-token author name as supplied: no given/family split is present in this record. -->
    <name>
      <namePart>Ymyang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiang</namePart>
      <namePart type="family">Zhong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Li</namePart>
      <namePart type="family">Jin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiao</namePart>
      <namePart type="family">Sun</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jingwang</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <!-- Single-token author name as supplied: no given/family split is present in this record. -->
    <name>
      <namePart>Gaojinpeng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Qing</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yang</namePart>
      <namePart type="family">Bai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jingyuan</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rui</namePart>
      <namePart type="family">Jiang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Qin</namePart>
      <namePart type="family">Lei</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kaiwen</namePart>
      <namePart type="family">Wei</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>
    <abstract>Multimodal Retrieval-Augmented Generation (MRAG) enhances reasoning capabilities by integrating external knowledge. However, existing benchmarks primarily focus on simple image-text interactions, overlooking complex visual formats like charts that are prevalent in real-world applications. In this work, we introduce a novel task, Chart-based MRAG, to address this limitation. To generate high-quality evaluation samples, we propose CHARGE (CHARt-based document question-answering GEneration), a semi-automatic framework for generating evaluation samples through multi-modal keypoint extraction, knowledge graph construction, and QA pair synthesis. By combining CHARGE with expert validation, we construct Chart-MRAG Bench, a comprehensive benchmark for chart-based MRAG evaluation, featuring 4,738 question-answering pairs across 8 domains from real-world documents. Our experiments reveal three critical limitations in current approaches: (1) unified multimodal embedding retrieval methods struggle in chart-based scenarios, (2) even with ground-truth retrieval, state-of-the-art Multimodal Large Language Models (MLLMs) achieve only 71.15% Correctness and 80.74% Coverage scores, and (3) widely used MLLMs demonstrate consistent text-over-visual modality bias. These findings highlight great challenges in processing information-dense visual formats. We will make our code and dataset publicly available.</abstract>
    <identifier type="citekey">ymyang-etal-2026-chart</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.1164/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>25392</start>
        <end>25445</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Chart-MRAG: Benchmarking Multimodal Retrieval Augmented Generation on Chart-based Documents
%A Ymyang
%A Zhong, Jiang
%A Jin, Li
%A Sun, Xiao
%A Huang, Jingwang
%A Gaojinpeng
%A Liu, Qing
%A Bai, Yang
%A Zhang, Jingyuan
%A Jiang, Rui
%A Lei, Qin
%A Wei, Kaiwen
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F ymyang-etal-2026-chart
%X Multimodal Retrieval-Augmented Generation (MRAG) enhances reasoning capabilities by integrating external knowledge. However, existing benchmarks primarily focus on simple image-text interactions, overlooking complex visual formats like charts that are prevalent in real-world applications. In this work, we introduce a novel task, Chart-based MRAG, to address this limitation. To generate high-quality evaluation samples, we propose CHARGE (CHARt-based document question-answering GEneration), a semi-automatic framework for generating evaluation samples through multi-modal keypoint extraction, knowledge graph construction, and QA pair synthesis. By combining CHARGE with expert validation, we construct Chart-MRAG Bench, a comprehensive benchmark for chart-based MRAG evaluation, featuring 4,738 question-answering pairs across 8 domains from real-world documents. Our experiments reveal three critical limitations in current approaches: (1) unified multimodal embedding retrieval methods struggle in chart-based scenarios, (2) even with ground-truth retrieval, state-of-the-art Multimodal Large Language Models (MLLMs) achieve only 71.15% Correctness and 80.74% Coverage scores, and (3) widely used MLLMs demonstrate consistent text-over-visual modality bias. These findings highlight great challenges in processing information-dense visual formats. We will make our code and dataset publicly available.
%U https://aclanthology.org/2026.acl-long.1164/
%P 25392-25445
Markdown (Informal)
[Chart-MRAG: Benchmarking Multimodal Retrieval Augmented Generation on Chart-based Documents](https://aclanthology.org/2026.acl-long.1164/) (Ymyang et al., ACL 2026)
ACL
- Ymyang, Jiang Zhong, Li Jin, Xiao Sun, Jingwang Huang, Gaojinpeng, Qing Liu, Yang Bai, Jingyuan Zhang, Rui Jiang, Qin Lei, and Kaiwen Wei. 2026. Chart-MRAG: Benchmarking Multimodal Retrieval Augmented Generation on Chart-based Documents. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 25392–25445, San Diego, California, United States. Association for Computational Linguistics.