@inproceedings{li-etal-2025-llm-judge,
title = "{LLM}-as-a-Judge Failures at Automating the Identification of Poor Quality Outputs in Free-Form Texts",
author = "Li, Zongxia and
Wu, Xiyang and
Mondal, Ishani and
Siu, Alexa and
Boyd-Graber, Jordan Lee and
Nenkova, Ani",
editor = "Dong, Yue and
Xiao, Wen and
Zhang, Haopeng and
Zhang, Rui and
Ernst, Ori and
Wang, Lu and
Liu, Fei",
booktitle = "Proceedings of The 5th New Frontiers in Summarization Workshop",
month = nov,
year = "2025",
address = "Hybrid",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.newsum-main.1/",
pages = "1--16",
ISBN = "979-8-89176-337-1",
abstract = "Large language models (LLMs) such as GPT-4, Claude and LLaMA are routinely used to evaluate long-form text generated by language models. We study the ability of these models to identify low quality texts, an increasingly rare subset of output which is of great interest to pinpoint during development. We present experiments with a panel of LLM judges, and crowd-sourced approximations of reference judgments. Pinpointing sub-par outputs is a difficult task for both models and crowdworkers, with models doing overall better. Moreover, unlike findings in prior work on factoid question answering, panels of cheaper models do not agree as well with high quality developer judgments of low quality as panels of frontier models. We present qualitative and quantitative analysis of the relative strengths of models in the panel, gleaning insights why they yield better results over a single model."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2025-llm-judge">
<titleInfo>
<title>LLM-as-a-Judge Failures at Automating the Identification of Poor Quality Outputs in Free-Form Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zongxia</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiyang</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ishani</namePart>
<namePart type="family">Mondal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexa</namePart>
<namePart type="family">Siu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="given">Lee</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ani</namePart>
<namePart type="family">Nenkova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of The 5th New Frontiers in Summarization Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wen</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haopeng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ori</namePart>
<namePart type="family">Ernst</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hybrid</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-337-1</identifier>
</relatedItem>
<abstract>Large language models (LLMs) such as GPT-4, Claude and LLaMA are routinely used to evaluate long-form text generated by language models. We study the ability of these models to identify low quality texts, an increasingly rare subset of output which is of great interest to pinpoint during development. We present experiments with a panel of LLM judges, and crowd-sourced approximations of reference judgments. Pinpointing sub-par outputs is a difficult task for both models and crowdworkers, with models doing overall better. Moreover, unlike findings in prior work on factoid question answering, panels of cheaper models do not agree as well with high quality developer judgments of low quality as panels of frontier models. We present qualitative and quantitative analysis of the relative strengths of models in the panel, gleaning insights why they yield better results over a single model.</abstract>
<identifier type="citekey">li-etal-2025-llm-judge</identifier>
<location>
<url>https://aclanthology.org/2025.newsum-main.1/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>1</start>
<end>16</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LLM-as-a-Judge Failures at Automating the Identification of Poor Quality Outputs in Free-Form Texts
%A Li, Zongxia
%A Wu, Xiyang
%A Mondal, Ishani
%A Siu, Alexa
%A Boyd-Graber, Jordan Lee
%A Nenkova, Ani
%Y Dong, Yue
%Y Xiao, Wen
%Y Zhang, Haopeng
%Y Zhang, Rui
%Y Ernst, Ori
%Y Wang, Lu
%Y Liu, Fei
%S Proceedings of The 5th New Frontiers in Summarization Workshop
%D 2025
%8 November
%I Association for Computational Linguistics
%C Hybrid
%@ 979-8-89176-337-1
%F li-etal-2025-llm-judge
%X Large language models (LLMs) such as GPT-4, Claude and LLaMA are routinely used to evaluate long-form text generated by language models. We study the ability of these models to identify low quality texts, an increasingly rare subset of output which is of great interest to pinpoint during development. We present experiments with a panel of LLM judges, and crowd-sourced approximations of reference judgments. Pinpointing sub-par outputs is a difficult task for both models and crowdworkers, with models doing overall better. Moreover, unlike findings in prior work on factoid question answering, panels of cheaper models do not agree as well with high quality developer judgments of low quality as panels of frontier models. We present qualitative and quantitative analysis of the relative strengths of models in the panel, gleaning insights why they yield better results over a single model.
%U https://aclanthology.org/2025.newsum-main.1/
%P 1-16
Markdown (Informal)
[LLM-as-a-Judge Failures at Automating the Identification of Poor Quality Outputs in Free-Form Texts](https://aclanthology.org/2025.newsum-main.1/) (Li et al., NewSum 2025)
ACL