@inproceedings{wang-etal-2025-agri,
title = "Agri-{CM}$^3$: A {C}hinese Massive Multi-modal, Multi-level Benchmark for Agricultural Understanding and Reasoning",
author = "Wang, Haotian and
Guan, Yi and
Meng, Fanshu and
Zhao, Chao and
Yan, Lian and
Yang, Yang and
Jiang, Jingchi",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.576/",
doi = "10.18653/v1/2025.acl-long.576",
pages = "11729--11754",
ISBN = "979-8-89176-251-0",
abstract = "Multi-modal Large Language Models (MLLMs) integrating images, text, and speech can provide farmers with accurate diagnoses and treatment of pests and diseases, enhancing agricultural efficiency and sustainability. However, existing benchmarks lack comprehensive evaluations, particularly in multi-level reasoning, making it challenging to identify model limitations. To address this issue, we introduce Agri-CM$^3$, an expert-validated benchmark assessing MLLMs' understanding and reasoning in agricultural management. It includes 3,939 images and 15,901 multi-level multiple-choice questions with detailed explanations. Evaluations of 45 MLLMs reveal significant gaps. Even GPT-4o achieves only 63.64{\%} accuracy, falling short in fine-grained reasoning tasks. Analysis across three reasoning levels and seven compositional abilities highlights key challenges in accuracy and cognitive understanding. Our study provides insights for advancing MLLMs in agricultural management, driving their development and application. Code and data are available at \url{https://github.com/HIT-Kwoo/Agri-CM3}."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2025-agri">
<titleInfo>
<title>Agri-CM³: A Chinese Massive Multi-modal, Multi-level Benchmark for Agricultural Understanding and Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haotian</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Guan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fanshu</namePart>
<namePart type="family">Meng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chao</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lian</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingchi</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Multi-modal Large Language Models (MLLMs) integrating images, text, and speech can provide farmers with accurate diagnoses and treatment of pests and diseases, enhancing agricultural efficiency and sustainability. However, existing benchmarks lack comprehensive evaluations, particularly in multi-level reasoning, making it challenging to identify model limitations. To address this issue, we introduce Agri-CM³, an expert-validated benchmark assessing MLLMs’ understanding and reasoning in agricultural management. It includes 3,939 images and 15,901 multi-level multiple-choice questions with detailed explanations. Evaluations of 45 MLLMs reveal significant gaps. Even GPT-4o achieves only 63.64% accuracy, falling short in fine-grained reasoning tasks. Analysis across three reasoning levels and seven compositional abilities highlights key challenges in accuracy and cognitive understanding. Our study provides insights for advancing MLLMs in agricultural management, driving their development and application. Code and data are available at https://github.com/HIT-Kwoo/Agri-CM3.</abstract>
<identifier type="citekey">wang-etal-2025-agri</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.576</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.576/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>11729</start>
<end>11754</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Agri-CM³: A Chinese Massive Multi-modal, Multi-level Benchmark for Agricultural Understanding and Reasoning
%A Wang, Haotian
%A Guan, Yi
%A Meng, Fanshu
%A Zhao, Chao
%A Yan, Lian
%A Yang, Yang
%A Jiang, Jingchi
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F wang-etal-2025-agri
%X Multi-modal Large Language Models (MLLMs) integrating images, text, and speech can provide farmers with accurate diagnoses and treatment of pests and diseases, enhancing agricultural efficiency and sustainability. However, existing benchmarks lack comprehensive evaluations, particularly in multi-level reasoning, making it challenging to identify model limitations. To address this issue, we introduce Agri-CM³, an expert-validated benchmark assessing MLLMs’ understanding and reasoning in agricultural management. It includes 3,939 images and 15,901 multi-level multiple-choice questions with detailed explanations. Evaluations of 45 MLLMs reveal significant gaps. Even GPT-4o achieves only 63.64% accuracy, falling short in fine-grained reasoning tasks. Analysis across three reasoning levels and seven compositional abilities highlights key challenges in accuracy and cognitive understanding. Our study provides insights for advancing MLLMs in agricultural management, driving their development and application. Code and data are available at https://github.com/HIT-Kwoo/Agri-CM3.
%R 10.18653/v1/2025.acl-long.576
%U https://aclanthology.org/2025.acl-long.576/
%U https://doi.org/10.18653/v1/2025.acl-long.576
%P 11729-11754
Markdown (Informal)
[Agri-CM3: A Chinese Massive Multi-modal, Multi-level Benchmark for Agricultural Understanding and Reasoning](https://aclanthology.org/2025.acl-long.576/) (Wang et al., ACL 2025)
ACL
- Haotian Wang, Yi Guan, Fanshu Meng, Chao Zhao, Lian Yan, Yang Yang, and Jingchi Jiang. 2025. Agri-CM3: A Chinese Massive Multi-modal, Multi-level Benchmark for Agricultural Understanding and Reasoning. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 11729–11754, Vienna, Austria. Association for Computational Linguistics.