@inproceedings{li-etal-2025-cmmath,
title = "{CMM}a{TH}: A {C}hinese Multi-modal Math Skill Evaluation Benchmark for Foundation Models",
author = "Li, Zhongzhi and
Zhang, Ming-Liang and
Wang, Pei-Jie and
Xu, Jian and
Zhang, Rui-Song and
Yin, Fei and
Ji, Zhi-Long and
Bai, Jin-Feng and
Pan, Zhen-Ru and
Zhang, Jiaxin and
Liu, Cheng-Lin",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.184/",
pages = "2690--2726",
abstract = "With the rapid advancements in multimodal large language models, evaluating their multimodal mathematical capabilities continues to receive wide attention. Although datasets such as MathVista have been introduced for evaluating mathematical capabilities in multimodal scenarios, there remains a lack of evaluation tools and datasets tailored for fine-grained assessment in Chinese K12 education. To systematically evaluate the ability of multimodal large models to solve Chinese multimodal mathematical problems, we propose a Chinese Multi-modal Math Skill Evaluation Benchmark (CMMaTH), containing 23,856 multimodal K12 math related questions, making it the largest Chinese multimodal mathematical problem benchmark to date. CMMaTH includes questions ranging from elementary to high school levels, offering greater diversity in problem types, solution goals, visual elements, detailed knowledge points, and standard solution annotations. To facilitate stable, fast, and cost-free model evaluation, we have developed an open-source tool called GradeGPT, which is integrated with the CMMaTH dataset. Our data and code are available at \url{https://github.com/zzli2022/CMMaTH}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2025-cmmath">
<titleInfo>
<title>CMMaTH: A Chinese Multi-modal Math Skill Evaluation Benchmark for Foundation Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhongzhi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming-Liang</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pei-Jie</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jian</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui-Song</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yin</namePart>
<namePart type="family">Fei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhi-Long</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jin-Feng</namePart>
<namePart type="family">Bai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhen-Ru</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaxin</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cheng-Lin</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With the rapid advancements in multimodal large language models, evaluating their multimodal mathematical capabilities continues to receive wide attention. Although datasets such as MathVista have been introduced for evaluating mathematical capabilities in multimodal scenarios, there remains a lack of evaluation tools and datasets tailored for fine-grained assessment in Chinese K12 education. To systematically evaluate the ability of multimodal large models to solve Chinese multimodal mathematical problems, we propose a Chinese Multi-modal Math Skill Evaluation Benchmark (CMMaTH), containing 23,856 multimodal K12 math related questions, making it the largest Chinese multimodal mathematical problem benchmark to date. CMMaTH includes questions ranging from elementary to high school levels, offering greater diversity in problem types, solution goals, visual elements, detailed knowledge points, and standard solution annotations. To facilitate stable, fast, and cost-free model evaluation, we have developed an open-source tool called GradeGPT, which is integrated with the CMMaTH dataset. Our data and code are available at https://github.com/zzli2022/CMMaTH.</abstract>
<identifier type="citekey">li-etal-2025-cmmath</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.184/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>2690</start>
<end>2726</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CMMaTH: A Chinese Multi-modal Math Skill Evaluation Benchmark for Foundation Models
%A Li, Zhongzhi
%A Zhang, Ming-Liang
%A Wang, Pei-Jie
%A Xu, Jian
%A Zhang, Rui-Song
%A Yin, Fei
%A Ji, Zhi-Long
%A Bai, Jin-Feng
%A Pan, Zhen-Ru
%A Zhang, Jiaxin
%A Liu, Cheng-Lin
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F li-etal-2025-cmmath
%X With the rapid advancements in multimodal large language models, evaluating their multimodal mathematical capabilities continues to receive wide attention. Although datasets such as MathVista have been introduced for evaluating mathematical capabilities in multimodal scenarios, there remains a lack of evaluation tools and datasets tailored for fine-grained assessment in Chinese K12 education. To systematically evaluate the ability of multimodal large models to solve Chinese multimodal mathematical problems, we propose a Chinese Multi-modal Math Skill Evaluation Benchmark (CMMaTH), containing 23,856 multimodal K12 math related questions, making it the largest Chinese multimodal mathematical problem benchmark to date. CMMaTH includes questions ranging from elementary to high school levels, offering greater diversity in problem types, solution goals, visual elements, detailed knowledge points, and standard solution annotations. To facilitate stable, fast, and cost-free model evaluation, we have developed an open-source tool called GradeGPT, which is integrated with the CMMaTH dataset. Our data and code are available at https://github.com/zzli2022/CMMaTH.
%U https://aclanthology.org/2025.coling-main.184/
%P 2690-2726
Markdown (Informal)
[CMMaTH: A Chinese Multi-modal Math Skill Evaluation Benchmark for Foundation Models](https://aclanthology.org/2025.coling-main.184/) (Li et al., COLING 2025)
ACL
Zhongzhi Li, Ming-Liang Zhang, Pei-Jie Wang, Jian Xu, Rui-Song Zhang, Fei Yin, Zhi-Long Ji, Jin-Feng Bai, Zhen-Ru Pan, Jiaxin Zhang, and Cheng-Lin Liu. 2025. CMMaTH: A Chinese Multi-modal Math Skill Evaluation Benchmark for Foundation Models. In Proceedings of the 31st International Conference on Computational Linguistics, pages 2690–2726, Abu Dhabi, UAE. Association for Computational Linguistics.