@inproceedings{leng-etal-2025-praetor,
title = "Praetor: A Fine-Grained Generative {LLM} Evaluator with Instance-Level Customizable Evaluation Criteria",
author = "Leng, Yongqi and
Jin, Renren and
Chen, Yue and
Han, Zhuowen and
Shi, Ling and
Peng, Jianxiang and
Yang, Lei and
Xiao, Juesi and
Xiong, Deyi",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.513/",
doi = "10.18653/v1/2025.acl-long.513",
pages = "10386--10418",
ISBN = "979-8-89176-251-0",
abstract = "With the increasing capability of large language models (LLMs), LLM-as-a-judge has emerged as a new evaluation paradigm. Compared with traditional automatic and manual evaluation, LLM evaluators exhibit better interpretability and efficiency. Despite this, existing LLM evaluators suffer from limited use scenarios and poor flexibility. To mitigate these issues, we propose Praetor, a fine-grained generative LLM evaluator with instance-level customazable evaluation criteria. To train Praetor, we curate a large-scale dataset guided with a hierarchical guideline covering a wide range of tasks and instance-level evaluation criteria. We train Praetor on this dataset in a multi-task learning fashion, which enables to evaluate LLMs in either pointwise grading or pairwise comparison way and support two languages simultaneously with a high flexibility of setting evaluation criteria. Extensive experiments demonstrate that Praetor outperforms previous LLM evaluators and instruction-tuned LLMs on multiple benchmarks, setting new SOTA results. It also exhibits the potential for generating critiques as scalable feedback to further improve LLMs. Our model and related resources are released at \url{https://github.com/tjunlp-lab/Praetor}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="leng-etal-2025-praetor">
<titleInfo>
<title>Praetor: A Fine-Grained Generative LLM Evaluator with Instance-Level Customizable Evaluation Criteria</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yongqi</namePart>
<namePart type="family">Leng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Renren</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhuowen</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ling</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianxiang</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lei</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juesi</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Deyi</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>With the increasing capability of large language models (LLMs), LLM-as-a-judge has emerged as a new evaluation paradigm. Compared with traditional automatic and manual evaluation, LLM evaluators exhibit better interpretability and efficiency. Despite this, existing LLM evaluators suffer from limited use scenarios and poor flexibility. To mitigate these issues, we propose Praetor, a fine-grained generative LLM evaluator with instance-level customizable evaluation criteria. To train Praetor, we curate a large-scale dataset guided by a hierarchical guideline covering a wide range of tasks and instance-level evaluation criteria. We train Praetor on this dataset in a multi-task learning fashion, which enables it to evaluate LLMs in either a pointwise grading or a pairwise comparison manner and to support two languages simultaneously, with high flexibility in setting evaluation criteria. Extensive experiments demonstrate that Praetor outperforms previous LLM evaluators and instruction-tuned LLMs on multiple benchmarks, setting new SOTA results. It also exhibits the potential for generating critiques as scalable feedback to further improve LLMs. Our model and related resources are released at https://github.com/tjunlp-lab/Praetor.</abstract>
<identifier type="citekey">leng-etal-2025-praetor</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.513</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.513/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>10386</start>
<end>10418</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Praetor: A Fine-Grained Generative LLM Evaluator with Instance-Level Customizable Evaluation Criteria
%A Leng, Yongqi
%A Jin, Renren
%A Chen, Yue
%A Han, Zhuowen
%A Shi, Ling
%A Peng, Jianxiang
%A Yang, Lei
%A Xiao, Juesi
%A Xiong, Deyi
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F leng-etal-2025-praetor
%X With the increasing capability of large language models (LLMs), LLM-as-a-judge has emerged as a new evaluation paradigm. Compared with traditional automatic and manual evaluation, LLM evaluators exhibit better interpretability and efficiency. Despite this, existing LLM evaluators suffer from limited use scenarios and poor flexibility. To mitigate these issues, we propose Praetor, a fine-grained generative LLM evaluator with instance-level customizable evaluation criteria. To train Praetor, we curate a large-scale dataset guided by a hierarchical guideline covering a wide range of tasks and instance-level evaluation criteria. We train Praetor on this dataset in a multi-task learning fashion, which enables it to evaluate LLMs in either a pointwise grading or a pairwise comparison manner and to support two languages simultaneously, with high flexibility in setting evaluation criteria. Extensive experiments demonstrate that Praetor outperforms previous LLM evaluators and instruction-tuned LLMs on multiple benchmarks, setting new SOTA results. It also exhibits the potential for generating critiques as scalable feedback to further improve LLMs. Our model and related resources are released at https://github.com/tjunlp-lab/Praetor.
%R 10.18653/v1/2025.acl-long.513
%U https://aclanthology.org/2025.acl-long.513/
%U https://doi.org/10.18653/v1/2025.acl-long.513
%P 10386-10418
Markdown (Informal)
[Praetor: A Fine-Grained Generative LLM Evaluator with Instance-Level Customizable Evaluation Criteria](https://aclanthology.org/2025.acl-long.513/) (Leng et al., ACL 2025)
ACL
- Yongqi Leng, Renren Jin, Yue Chen, Zhuowen Han, Ling Shi, Jianxiang Peng, Lei Yang, Juesi Xiao, and Deyi Xiong. 2025. Praetor: A Fine-Grained Generative LLM Evaluator with Instance-Level Customizable Evaluation Criteria. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 10386–10418, Vienna, Austria. Association for Computational Linguistics.
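
For readers who want a concrete picture of the pointwise-grading setup the abstract describes, below is a minimal sketch using the Hugging Face `transformers` text-generation pipeline. The model id, prompt template, and evaluation criteria are illustrative assumptions only, not the official Praetor API; the released checkpoint and the exact input format should be taken from the paper and the linked repository (https://github.com/tjunlp-lab/Praetor).

```python
# Minimal sketch of pointwise grading with a generative LLM judge.
# NOTE: the model id and prompt format below are hypothetical placeholders,
# not the paper's released interface.
from transformers import pipeline

judge = pipeline(
    "text-generation",
    model="tjunlp-lab/Praetor",  # hypothetical model id; use the checkpoint from the GitHub repo
)

# Instance-level criteria are supplied directly in the prompt, which is the
# kind of per-instance customization the abstract refers to.
prompt = (
    "You are an evaluator. Using the criteria below, grade the response on a 1-5 scale "
    "and briefly justify the score.\n"
    "Criteria: factual accuracy; completeness of the answer.\n"
    "Instruction: Summarize the main causes of the 2008 financial crisis.\n"
    "Response: The crisis was driven by subprime mortgage defaults and excessive leverage.\n"
    "Evaluation:"
)

result = judge(prompt, max_new_tokens=256, do_sample=False)
print(result[0]["generated_text"])
```

The same pattern extends to pairwise comparison by placing two candidate responses in the prompt and asking the judge which one better satisfies the stated criteria.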