@inproceedings{lin-etal-2025-judge,
title = "Do Before You Judge: Self-Reference as a Pathway to Better {LLM} Evaluation",
author = "Lin, Wei-Hsiang and
Wei, Sheng-Lun and
Huang, Hen-Hsen and
Chen, Hsin-Hsi",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1342/",
doi = "10.18653/v1/2025.findings-emnlp.1342",
pages = "24651--24672",
ISBN = "979-8-89176-335-7",
abstract = "LLM-as-Judge frameworks are increasingly popular for AI evaluation, yet research findings on the relationship between models' generation and judgment abilities remain inconsistent. We investigate this relationship through systematic dataset- and instance-level analyses across 11 models and 21 diverse tasks. Despite both capabilities relying on the same underlying knowledge, our analyses reveal they are only weakly correlated, primarily due to LLMs' sensitivity to the responses being judged. To address this, we propose a self-reference-guided evaluation strategy that leverages a model{'}s own answers as references. This approach significantly strengthens the correlation between generation and judgment abilities, offering a practical path to align these skills and providing a reliable proxy for model selection in evaluation tasks."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lin-etal-2025-judge">
<titleInfo>
<title>Do Before You Judge: Self-Reference as a Pathway to Better LLM Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wei-Hsiang</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sheng-Lun</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hen-Hsen</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hsin-Hsi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>LLM-as-Judge frameworks are increasingly popular for AI evaluation, yet research findings on the relationship between models’ generation and judgment abilities remain inconsistent. We investigate this relationship through systematic dataset- and instance-level analyses across 11 models and 21 diverse tasks. Despite both capabilities relying on the same underlying knowledge, our analyses reveal they are only weakly correlated, primarily due to LLMs’ sensitivity to the responses being judged. To address this, we propose a self-reference-guided evaluation strategy that leverages a model’s own answers as references. This approach significantly strengthens the correlation between generation and judgment abilities, offering a practical path to align these skills and providing a reliable proxy for model selection in evaluation tasks.</abstract>
<identifier type="citekey">lin-etal-2025-judge</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.1342</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1342/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>24651</start>
<end>24672</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do Before You Judge: Self-Reference as a Pathway to Better LLM Evaluation
%A Lin, Wei-Hsiang
%A Wei, Sheng-Lun
%A Huang, Hen-Hsen
%A Chen, Hsin-Hsi
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F lin-etal-2025-judge
%X LLM-as-Judge frameworks are increasingly popular for AI evaluation, yet research findings on the relationship between models’ generation and judgment abilities remain inconsistent. We investigate this relationship through systematic dataset- and instance-level analyses across 11 models and 21 diverse tasks. Despite both capabilities relying on the same underlying knowledge, our analyses reveal they are only weakly correlated, primarily due to LLMs’ sensitivity to the responses being judged. To address this, we propose a self-reference-guided evaluation strategy that leverages a model’s own answers as references. This approach significantly strengthens the correlation between generation and judgment abilities, offering a practical path to align these skills and providing a reliable proxy for model selection in evaluation tasks.
%R 10.18653/v1/2025.findings-emnlp.1342
%U https://aclanthology.org/2025.findings-emnlp.1342/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.1342
%P 24651-24672
Markdown (Informal)
[Do Before You Judge: Self-Reference as a Pathway to Better LLM Evaluation](https://aclanthology.org/2025.findings-emnlp.1342/) (Lin et al., Findings 2025)
ACL