% Conference-paper record for the ACL Anthology item 2025.acl-industry.67.
% Text outside the entry braces is ignored by BibTeX.
@inproceedings{zhen-etal-2025-enhancing,
  title     = {Enhancing {LLM}-as-a-Judge through Active-Sampling-based Prompt Optimization},
  author    = {Zhen, Cheng and Zheng, Ervine and Kuang, Jilong and Tso, Geoffrey Jay},
  editor    = {Rehm, Georg and Li, Yunyao},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.acl-industry.67/},
  doi       = {10.18653/v1/2025.acl-industry.67},
  pages     = {960--970},
  isbn      = {979-8-89176-288-6},
  abstract  = {We introduce an active-sampling-based framework for automatic prompt optimization, designed to enhance the performance of Large Language Model (LLM)-as-a-judge systems, which use LLMs to evaluate the quality of text or other outputs, in label-scarce settings. Unlike existing approaches that rely on extensive annotations, our method starts with no labeled data and iteratively selects and labels a small, diverse, and informative subset of samples to guide prompt refinement. At each iteration, our method evaluates the current prompt based on selected data and automatically updates the prompt, enabling efficient prompt optimization with minimal supervision. Moreover, we formulate sample selection as a convex optimization problem that balances uncertainty and diversity, maximizing the utility of limited labeling budgets. We validate our framework across four popular LLMs and three real-world datasets, including one from a deployed industry product. Results show that our optimized prompts consistently outperform baselines, achieving significant gains in evaluation quality and robustness while substantially reducing labeling costs.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, ID zhen-etal-2025-enhancing. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="zhen-etal-2025-enhancing">
    <!-- Title of the paper itself -->
    <titleInfo>
      <title>Enhancing LLM-as-a-Judge through Active-Sampling-based Prompt Optimization</title>
    </titleInfo>
    <!-- Authors: one personal name element each, role given as a MARC relator term -->
    <name type="personal">
      <namePart type="given">Cheng</namePart>
      <namePart type="family">Zhen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ervine</namePart>
      <namePart type="family">Zheng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jilong</namePart>
      <namePart type="family">Kuang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <!-- Two given-name parts: first and middle name are stored separately -->
      <namePart type="given">Geoffrey</namePart>
      <namePart type="given">Jay</namePart>
      <namePart type="family">Tso</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Georg</namePart>
        <namePart type="family">Rehm</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yunyao</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-288-6</identifier>
    </relatedItem>
    <abstract>We introduce an active-sampling-based framework for automatic prompt optimization, designed to enhance the performance of Large Language Model (LLM)-as-a-judge systems, which use LLMs to evaluate the quality of text or other outputs, in label-scarce settings. Unlike existing approaches that rely on extensive annotations, our method starts with no labeled data and iteratively selects and labels a small, diverse, and informative subset of samples to guide prompt refinement. At each iteration, our method evaluates the current prompt based on selected data and automatically updates the prompt, enabling efficient prompt optimization with minimal supervision. Moreover, we formulate sample selection as a convex optimization problem that balances uncertainty and diversity, maximizing the utility of limited labeling budgets. We validate our framework across four popular LLMs and three real-world datasets, including one from a deployed industry product. Results show that our optimized prompts consistently outperform baselines, achieving significant gains in evaluation quality and robustness while substantially reducing labeling costs.</abstract>
    <!-- Identifiers and landing page for the paper -->
    <identifier type="citekey">zhen-etal-2025-enhancing</identifier>
    <identifier type="doi">10.18653/v1/2025.acl-industry.67</identifier>
    <location>
      <url>https://aclanthology.org/2025.acl-industry.67/</url>
    </location>
    <!-- Position of the paper inside the host volume -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>960</start>
        <end>970</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Enhancing LLM-as-a-Judge through Active-Sampling-based Prompt Optimization
%A Zhen, Cheng
%A Zheng, Ervine
%A Kuang, Jilong
%A Tso, Geoffrey Jay
%Y Rehm, Georg
%Y Li, Yunyao
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-288-6
%F zhen-etal-2025-enhancing
%X We introduce an active-sampling-based framework for automatic prompt optimization, designed to enhance the performance of Large Language Model (LLM)-as-a-judge systems, which use LLMs to evaluate the quality of text or other outputs, in label-scarce settings. Unlike existing approaches that rely on extensive annotations, our method starts with no labeled data and iteratively selects and labels a small, diverse, and informative subset of samples to guide prompt refinement. At each iteration, our method evaluates the current prompt based on selected data and automatically updates the prompt, enabling efficient prompt optimization with minimal supervision. Moreover, we formulate sample selection as a convex optimization problem that balances uncertainty and diversity, maximizing the utility of limited labeling budgets. We validate our framework across four popular LLMs and three real-world datasets, including one from a deployed industry product. Results show that our optimized prompts consistently outperform baselines, achieving significant gains in evaluation quality and robustness while substantially reducing labeling costs.
%R 10.18653/v1/2025.acl-industry.67
%U https://aclanthology.org/2025.acl-industry.67/
%U https://doi.org/10.18653/v1/2025.acl-industry.67
%P 960-970
Markdown (Informal)
[Enhancing LLM-as-a-Judge through Active-Sampling-based Prompt Optimization](https://aclanthology.org/2025.acl-industry.67/) (Zhen et al., ACL 2025)
ACL