@inproceedings{kumar-etal-2025-courteval,
title = "{C}ourt{E}val: A Courtroom-Based Multi-Agent Evaluation Framework",
author = "Kumar, Sandeep and
Nargund, Abhijit A and
Sridhar, Vivek",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1327/",
doi = "10.18653/v1/2025.findings-acl.1327",
pages = "25875--25887",
ISBN = "979-8-89176-256-5",
abstract = "Automated evaluation is crucial for assessing the quality of natural language text, especially in open-ended generation tasks, given the costly and time-consuming nature of human evaluation. Existing automatic evaluation metrics like ROUGE and BLEU often show low correlation with human judgments. As large language models (LLMs) continue to evolve, researchers have explored their use as alternatives to human evaluators. Although single-agent approaches have shown potential, results indicate that further progress is required to close the gap between their performance and the quality of human assessments. Acknowledging that human evaluations involve multiple annotators, the multi-agent approach allows LLMs to collaborate, enhancing efficiency and effectiveness in handling complex tasks. In this paper, we present CourtEval, a novel Multi-Agent Evaluation Framework modeled after courtroom dynamics. Each agent takes on a distinct role: the Grader, similar to a judge, assigns an initial score; the Critic, like a prosecutor, challenges this score; and the Defender, akin to a defense attorney, defends it. Based on the input from both the Critic and Defender, the Grader re-evaluates the score, leading to a more balanced and fair final decision through this adversarial process. CourtEval substantially outperforms the previous state-of-the-art methods in two meta-evaluation benchmarks in NLG evaluation, SummEval and TopicalChat."
}
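The abstract outlines the Grader/Critic/Defender exchange only at a high level; the short Python sketch below illustrates one plausible reading of that courtroom loop. It is not the authors' implementation: the prompt wording, the 1-5 scale, and the call_llm callable are illustrative assumptions, with an echo stub standing in for a real LLM backend so the example runs on its own.

# Illustrative sketch (not the authors' code) of the courtroom-style
# evaluation loop described in the CourtEval abstract: a Grader scores,
# a Critic challenges the score, a Defender defends it, and the Grader
# re-scores after reading both arguments.
# `call_llm` is a hypothetical stand-in for any chat-completion client.

from typing import Callable

def court_eval(source: str, output: str, criterion: str,
               call_llm: Callable[[str], str]) -> str:
    grader_prompt = (
        f"Rate the {criterion} of the output on a 1-5 scale "
        f"and justify briefly.\nSource: {source}\nOutput: {output}"
    )
    initial = call_llm(grader_prompt)        # Grader: initial score
    critique = call_llm(                     # Critic: argues against the score
        "Act as a prosecutor. Argue why this assessment is too generous "
        f"or unjustified:\n{initial}"
    )
    defense = call_llm(                      # Defender: argues for the score
        f"Act as a defense attorney. Argue why this assessment is justified:\n{initial}"
    )
    final = call_llm(                        # Grader re-evaluates after both arguments
        f"Original assessment:\n{initial}\nCritic:\n{critique}\nDefender:\n{defense}\n"
        "Considering both arguments, give a final 1-5 score with a short rationale."
    )
    return final

if __name__ == "__main__":
    # Echo stub so the sketch runs without any LLM service.
    echo = lambda prompt: f"[model reply to: {prompt[:40]}...]"
    print(court_eval("A news article about rainfall.",
                     "It rained a lot yesterday.",
                     "coherence", echo))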