BibTeX

@inproceedings{zhou-etal-2025-miceval,
title = "{M}i{CE}val: Unveiling Multimodal Chain of Thought{'}s Quality via Image Description and Reasoning Steps",
author = "Zhou, Xiongtao and
He, Jie and
Chen, Lanyu and
Li, Jingyu and
Chen, Haojing and
Gutierrez Basulto, Victor and
Pan, Jeff Z. and
Chen, Hanjie",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.504/",
doi = "10.18653/v1/2025.naacl-long.504",
pages = "10002--10039",
ISBN = "979-8-89176-189-6",
abstract = "**Multimodal Chain of Thought (MCoT)** is a popular prompting strategy for improving the performance of multimodal large language models (MLLMs) across a range of complex reasoning tasks. Despite its popularity, there is a notable absence of automated methods for evaluating the quality of reasoning steps in MCoT. To address this gap, we propose **Multimodal Chain-of-Thought Evaluation (MiCEval)**, a framework designed to assess the correctness of reasoning chains by evaluating the quality of both the description and each reasoning step. The evaluation of the description component focuses on the accuracy of the image descriptions, while the reasoning step evaluates the quality of each step as it is conditionally generated based on the preceding steps. MiCEval is built upon a fine-grained dataset with annotations that rate each step according to correctness, relevance, and informativeness. Extensive experiments on four state-of-the-art MLLMs show that step-wise evaluations using MiCEval align more closely with human judgments compared to existing methods based on cosine similarity or fine-tuning approaches. MiCEval datasets and code can be found at: [https://anonymous{\_}github/MicEval](https://anonymous.4open.science/r/MiCEval-847F/README.md)."
}

MODS XML

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="zhou-etal-2025-miceval">
    <titleInfo>
      <title>MiCEval: Unveiling Multimodal Chain of Thought’s Quality via Image Description and Reasoning Steps</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Xiongtao</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jie</namePart>
      <namePart type="family">He</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lanyu</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jingyu</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Haojing</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Victor</namePart>
      <namePart type="family">Gutierrez Basulto</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jeff</namePart>
      <namePart type="given">Z</namePart>
      <namePart type="family">Pan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hanjie</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-189-6</identifier>
    </relatedItem>
    <abstract>Multimodal Chain of Thought (MCoT) is a popular prompting strategy for improving the performance of multimodal large language models (MLLMs) across a range of complex reasoning tasks. Despite its popularity, there is a notable absence of automated methods for evaluating the quality of reasoning steps in MCoT. To address this gap, we propose Multimodal Chain-of-Thought Evaluation (MiCEval), a framework designed to assess the correctness of reasoning chains by evaluating the quality of both the description and each reasoning step. The evaluation of the description component focuses on the accuracy of the image descriptions, while the reasoning step evaluation assesses the quality of each step, which is conditionally generated based on the preceding steps. MiCEval is built upon a fine-grained dataset with annotations that rate each step according to correctness, relevance, and informativeness. Extensive experiments on four state-of-the-art MLLMs show that step-wise evaluations using MiCEval align more closely with human judgments than existing methods based on cosine similarity or fine-tuning. MiCEval datasets and code can be found at: https://anonymous.4open.science/r/MiCEval-847F/README.md</abstract>
    <identifier type="citekey">zhou-etal-2025-miceval</identifier>
    <identifier type="doi">10.18653/v1/2025.naacl-long.504</identifier>
    <location>
      <url>https://aclanthology.org/2025.naacl-long.504/</url>
    </location>
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>10002</start>
        <end>10039</end>
      </extent>
    </part>
  </mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T MiCEval: Unveiling Multimodal Chain of Thought’s Quality via Image Description and Reasoning Steps
%A Zhou, Xiongtao
%A He, Jie
%A Chen, Lanyu
%A Li, Jingyu
%A Chen, Haojing
%A Gutierrez Basulto, Victor
%A Pan, Jeff Z.
%A Chen, Hanjie
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F zhou-etal-2025-miceval
%X Multimodal Chain of Thought (MCoT) is a popular prompting strategy for improving the performance of multimodal large language models (MLLMs) across a range of complex reasoning tasks. Despite its popularity, there is a notable absence of automated methods for evaluating the quality of reasoning steps in MCoT. To address this gap, we propose Multimodal Chain-of-Thought Evaluation (MiCEval), a framework designed to assess the correctness of reasoning chains by evaluating the quality of both the description and each reasoning step. The evaluation of the description component focuses on the accuracy of the image descriptions, while the reasoning step evaluation assesses the quality of each step, which is conditionally generated based on the preceding steps. MiCEval is built upon a fine-grained dataset with annotations that rate each step according to correctness, relevance, and informativeness. Extensive experiments on four state-of-the-art MLLMs show that step-wise evaluations using MiCEval align more closely with human judgments than existing methods based on cosine similarity or fine-tuning. MiCEval datasets and code can be found at: https://anonymous.4open.science/r/MiCEval-847F/README.md
%R 10.18653/v1/2025.naacl-long.504
%U https://aclanthology.org/2025.naacl-long.504/
%U https://doi.org/10.18653/v1/2025.naacl-long.504
%P 10002-10039
Markdown (Informal)
[MiCEval: Unveiling Multimodal Chain of Thought’s Quality via Image Description and Reasoning Steps](https://aclanthology.org/2025.naacl-long.504/) (Zhou et al., NAACL 2025)
ACL
Xiongtao Zhou, Jie He, Lanyu Chen, Jingyu Li, Haojing Chen, Victor Gutierrez Basulto, Jeff Z. Pan, and Hanjie Chen. 2025. MiCEval: Unveiling Multimodal Chain of Thought’s Quality via Image Description and Reasoning Steps. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 10002–10039, Albuquerque, New Mexico. Association for Computational Linguistics.
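
For readers skimming the abstract above, here is a minimal illustrative sketch of the step-wise evaluation idea it describes: each reasoning step is judged conditioned on the image description and the preceding steps, along the three annotated dimensions (correctness, relevance, informativeness). All names (`StepScore`, `judge`, `evaluate_chain`) are hypothetical placeholders for illustration, not the MiCEval codebase's actual API.

```python
# Illustrative sketch only (not MiCEval's actual code): score a multimodal
# chain of thought step by step, as described in the paper's abstract.
from dataclasses import dataclass

@dataclass
class StepScore:
    correctness: float      # is the step factually and logically right?
    relevance: float        # does it bear on the question being answered?
    informativeness: float  # does it add information beyond prior steps?

def judge(description: str, prior_steps: list[str], step: str) -> StepScore:
    """Placeholder for an MLLM-based judge; MiCEval's prompts and models differ."""
    raise NotImplementedError

def evaluate_chain(description: str, steps: list[str]) -> list[StepScore]:
    # The image description is evaluated for accuracy first; each reasoning
    # step is then judged conditioned on the description and all earlier
    # steps, mirroring the order in which the chain was generated.
    return [judge(description, steps[:i], step) for i, step in enumerate(steps)]
```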