@inproceedings{donati-etal-2025-large,
title = "Do Large Language Models understand how to be judges?",
author = "Donati, Nicol{\`o} and
Torroni, Paolo and
Savino, Giuseppe",
editor = "Cardoso, Henrique Lopes and
Sousa-Silva, Rui and
Koponen, Maarit and
Pareja-Lora, Antonio",
booktitle = "Proceedings of the 2nd LUHME Workshop",
month = oct,
year = "2025",
address = "Bologna, Italy",
publisher = "LUHME",
url = "https://aclanthology.org/2025.luhme-1.9/",
pages = "85--102",
abstract = "This paper investigates whether Large Language Models (LLMs) can effectively act as judges for evaluating open-ended text generation tasks, such as summarization, by interpreting nuanced editorial criteria. Traditional metrics like ROUGE and BLEU rely on surface-level overlap, while human evaluations remain costly and inconsistent. To address this, we propose a structured rubric with five dimensions: coherence, consistency, fluency, relevance, and ordering, each defined with explicit sub-criteria to guide LLMs in assessing semantic fidelity and structural quality. Using a purpose-built dataset of Italian news summaries generated by GPT-4o, each tailored to isolate specific criteria, we evaluate LLMs' ability to assign scores and rationales aligned with expert human judgments. Results show moderate alignment (Spearman{'}s {\ensuremath{\rho}} = 0.6{--}0.7) for criteria like relevance but reveal systematic biases, such as overestimating fluency and coherence, likely due to training data biases. We identify challenges in rubric interpretation, particularly for hierarchical or abstract criteria, and highlight limitations in cross-genre generalization. The study underscores the potential of LLMs as scalable evaluators but emphasizes the need for fine-tuning, diverse benchmarks, and refined rubrics to mitigate biases and enhance reliability. Future directions include expanding to multilingual and multi-genre contexts and exploring task-specific instruction tuning to improve alignment with human editorial standards."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="donati-etal-2025-large">
<titleInfo>
<title>Do Large Language Models understand how to be judges?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicolò</namePart>
<namePart type="family">Donati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paolo</namePart>
<namePart type="family">Torroni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Savino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd LUHME Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Henrique</namePart>
<namePart type="given">Lopes</namePart>
<namePart type="family">Cardoso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Sousa-Silva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maarit</namePart>
<namePart type="family">Koponen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Pareja-Lora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>LUHME</publisher>
<place>
<placeTerm type="text">Bologna, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper investigates whether Large Language Models (LLMs) can effectively act as judges for evaluating open-ended text generation tasks, such as summarization, by interpreting nuanced editorial criteria. Traditional metrics like ROUGE and BLEU rely on surface-level overlap, while human evaluations remain costly and inconsistent. To address this, we propose a structured rubric with five dimensions: coherence, consistency, fluency, relevance, and ordering, each defined with explicit sub-criteria to guide LLMs in assessing semantic fidelity and structural quality. Using a purpose-built dataset of Italian news summaries generated by GPT-4o, each tailored to isolate specific criteria, we evaluate LLMs’ ability to assign scores and rationales aligned with expert human judgments. Results show moderate alignment (Spearman’s ρ = 0.6–0.7) for criteria like relevance but reveal systematic biases, such as overestimating fluency and coherence, likely due to training data biases. We identify challenges in rubric interpretation, particularly for hierarchical or abstract criteria, and highlight limitations in cross-genre generalization. The study underscores the potential of LLMs as scalable evaluators but emphasizes the need for fine-tuning, diverse benchmarks, and refined rubrics to mitigate biases and enhance reliability. Future directions include expanding to multilingual and multi-genre contexts and exploring task-specific instruction tuning to improve alignment with human editorial standards.</abstract>
<identifier type="citekey">donati-etal-2025-large</identifier>
<location>
<url>https://aclanthology.org/2025.luhme-1.9/</url>
</location>
<part>
<date>2025-10</date>
<extent unit="page">
<start>85</start>
<end>102</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do Large Language Models understand how to be judges?
%A Donati, Nicolò
%A Torroni, Paolo
%A Savino, Giuseppe
%Y Cardoso, Henrique Lopes
%Y Sousa-Silva, Rui
%Y Koponen, Maarit
%Y Pareja-Lora, Antonio
%S Proceedings of the 2nd LUHME Workshop
%D 2025
%8 October
%I LUHME
%C Bologna, Italy
%F donati-etal-2025-large
%X This paper investigates whether Large Language Models (LLMs) can effectively act as judges for evaluating open-ended text generation tasks, such as summarization, by interpreting nuanced editorial criteria. Traditional metrics like ROUGE and BLEU rely on surface-level overlap, while human evaluations remain costly and inconsistent. To address this, we propose a structured rubric with five dimensions: coherence, consistency, fluency, relevance, and ordering, each defined with explicit sub-criteria to guide LLMs in assessing semantic fidelity and structural quality. Using a purpose-built dataset of Italian news summaries generated by GPT-4o, each tailored to isolate specific criteria, we evaluate LLMs’ ability to assign scores and rationales aligned with expert human judgments. Results show moderate alignment (Spearman’s ρ = 0.6–0.7) for criteria like relevance but reveal systematic biases, such as overestimating fluency and coherence, likely due to training data biases. We identify challenges in rubric interpretation, particularly for hierarchical or abstract criteria, and highlight limitations in cross-genre generalization. The study underscores the potential of LLMs as scalable evaluators but emphasizes the need for fine-tuning, diverse benchmarks, and refined rubrics to mitigate biases and enhance reliability. Future directions include expanding to multilingual and multi-genre contexts and exploring task-specific instruction tuning to improve alignment with human editorial standards.
%U https://aclanthology.org/2025.luhme-1.9/
%P 85-102
Markdown (Informal)
[Do Large Language Models understand how to be judges?](https://aclanthology.org/2025.luhme-1.9/) (Donati et al., LUHME 2025)