% BibTeX record for the CAFES workshop paper; the landing page is given in the url field.
@inproceedings{su-etal-2026-cafes,
  author    = {Su, Jiamin and
               Yan, Yibo and
               Gao, Zhuoran and
               Zhang, Han and
               Liu, Xiang and
               Zhou, Huiyu and
               Hu, Xuming},
  title     = {{CAFES}: A Collaborative Multi-Agent Framework for Multi-Granular Multimodal Essay Scoring},
  editor    = {Yan, Qianqi and
               Montariol, Syrielle and
               Fan, Yue and
               Gu, Jing and
               Pan, Jiayi and
               Li, Manling and
               Kordjamshidi, Parisa and
               Suhr, Alane and
               Wang, Xin Eric},
  booktitle = {Proceedings of the 4th Workshop on Advances in Language and Vision Research ({ALVR})},
  pages     = {115--138},
  month     = jul,
  year      = {2026},
  publisher = {Association for Computational Linguistics},
  address   = {San Diego, California, USA},
  isbn      = {979-8-89176-398-2},
  url       = {https://aclanthology.org/2026.alvr-main.10/},
  abstract  = {Automated Essay Scoring (AES) is crucial for modern education, particularly with the increasing prevalence of multimodal assessments. However, traditional AES methods struggle with evaluation generalizability and multimodal perception, while even recent Multimodal Large Language Model (MLLM)-based approaches can produce hallucinated justifications and scores misaligned with human judgment. To address the limitations, we introduce CAFES, the first collaborative multi-agent framework specifically designed for AES. It orchestrates three specialized agents: an Initial Scorer for rapid, trait-specific evaluations; a Feedback Pool Manager to aggregate detailed and evidence-grounded feedback; and a Reflective Scorer that iteratively refines scores based on this feedback to enhance human alignment. Extensive experiments, using widely adopted MLLMs, achieve an average relative improvement of 21{\%} in Quadratic Weighted Kappa (QWK) against ground truth, with particularly strong gains in grammatical and lexical diversity. Our proposed CAFES paves the way for an intelligent multimodal AES system. The code and dataset are available at https://anonymous.4open.science/r/CAFES-C87F/.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record describing the paper; the proceedings volume is the nested host item. -->
  <mods ID="su-etal-2026-cafes">
    <titleInfo>
      <title>CAFES: A Collaborative Multi-Agent Framework for Multi-Granular Multimodal Essay Scoring</title>
    </titleInfo>
    <!-- Paper authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Jiamin</namePart>
      <namePart type="family">Su</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yibo</namePart>
      <namePart type="family">Yan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhuoran</namePart>
      <namePart type="family">Gao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Han</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiang</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Huiyu</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xuming</namePart>
      <namePart type="family">Hu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 4th Workshop on Advances in Language and Vision Research (ALVR)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Qianqi</namePart>
        <namePart type="family">Yan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Syrielle</namePart>
        <namePart type="family">Montariol</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yue</namePart>
        <namePart type="family">Fan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jing</namePart>
        <namePart type="family">Gu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiayi</namePart>
        <namePart type="family">Pan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Manling</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Parisa</namePart>
        <namePart type="family">Kordjamshidi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alane</namePart>
        <namePart type="family">Suhr</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <!-- Two given-name parts for this editor. -->
        <namePart type="given">Xin</namePart>
        <namePart type="given">Eric</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-398-2</identifier>
    </relatedItem>
    <abstract>Automated Essay Scoring (AES) is crucial for modern education, particularly with the increasing prevalence of multimodal assessments. However, traditional AES methods struggle with evaluation generalizability and multimodal perception, while even recent Multimodal Large Language Model (MLLM)-based approaches can produce hallucinated justifications and scores misaligned with human judgment. To address the limitations, we introduce CAFES, the first collaborative multi-agent framework specifically designed for AES. It orchestrates three specialized agents: an Initial Scorer for rapid, trait-specific evaluations; a Feedback Pool Manager to aggregate detailed and evidence-grounded feedback; and a Reflective Scorer that iteratively refines scores based on this feedback to enhance human alignment. Extensive experiments, using widely adopted MLLMs, achieve an average relative improvement of 21% in Quadratic Weighted Kappa (QWK) against ground truth, with particularly strong gains in grammatical and lexical diversity. Our proposed CAFES paves the way for an intelligent multimodal AES system. The code and dataset are available at https://anonymous.4open.science/r/CAFES-C87F/.</abstract>
    <identifier type="citekey">su-etal-2026-cafes</identifier>
    <location>
      <url>https://aclanthology.org/2026.alvr-main.10/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>115</start>
        <end>138</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T CAFES: A Collaborative Multi-Agent Framework for Multi-Granular Multimodal Essay Scoring
%A Su, Jiamin
%A Yan, Yibo
%A Gao, Zhuoran
%A Zhang, Han
%A Liu, Xiang
%A Zhou, Huiyu
%A Hu, Xuming
%Y Yan, Qianqi
%Y Montariol, Syrielle
%Y Fan, Yue
%Y Gu, Jing
%Y Pan, Jiayi
%Y Li, Manling
%Y Kordjamshidi, Parisa
%Y Suhr, Alane
%Y Wang, Xin Eric
%S Proceedings of the 4th Workshop on Advances in Language and Vision Research (ALVR)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-398-2
%F su-etal-2026-cafes
%X Automated Essay Scoring (AES) is crucial for modern education, particularly with the increasing prevalence of multimodal assessments. However, traditional AES methods struggle with evaluation generalizability and multimodal perception, while even recent Multimodal Large Language Model (MLLM)-based approaches can produce hallucinated justifications and scores misaligned with human judgment. To address the limitations, we introduce CAFES, the first collaborative multi-agent framework specifically designed for AES. It orchestrates three specialized agents: an Initial Scorer for rapid, trait-specific evaluations; a Feedback Pool Manager to aggregate detailed and evidence-grounded feedback; and a Reflective Scorer that iteratively refines scores based on this feedback to enhance human alignment. Extensive experiments, using widely adopted MLLMs, achieve an average relative improvement of 21% in Quadratic Weighted Kappa (QWK) against ground truth, with particularly strong gains in grammatical and lexical diversity. Our proposed CAFES paves the way for an intelligent multimodal AES system. The code and dataset are available at https://anonymous.4open.science/r/CAFES-C87F/.
%U https://aclanthology.org/2026.alvr-main.10/
%P 115-138
Markdown (Informal)
[CAFES: A Collaborative Multi-Agent Framework for Multi-Granular Multimodal Essay Scoring](https://aclanthology.org/2026.alvr-main.10/) (Su et al., ALVR 2026)
ACL
- Jiamin Su, Yibo Yan, Zhuoran Gao, Han Zhang, Xiang Liu, Huiyu Zhou, and Xuming Hu. 2026. CAFES: A Collaborative Multi-Agent Framework for Multi-Granular Multimodal Essay Scoring. In Proceedings of the 4th Workshop on Advances in Language and Vision Research (ALVR), pages 115–138, San Diego, California, USA. Association for Computational Linguistics.