% ACL Anthology record: JudgeAgent paper, Findings of ACL 2026, pages 13004--13030.
% Case-sensitive words in title/booktitle are protected with whole-word braces.
@inproceedings{shi-etal-2026-judgeagent,
  title     = {{JudgeAgent}: Beyond Static Benchmarks for Knowledge-Driven and Dynamic {LLM} Evaluation},
  author    = {Shi, Zhichao and
               Jiang, Xuhui and
               Xu, Chengjin and
               Yao, Cangli and
               Ma, Shengjie and
               Shen, Yinghan and
               Li, Zixuan and
               Guo, Jian and
               Wang, Yuanzhuo},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.634/},
  pages     = {13004--13030},
  isbn      = {979-8-89176-395-1},
  abstract  = {Current evaluation methods for large language models (LLMs) primarily rely on static benchmarks, presenting two major challenges: limited knowledge coverage and fixed difficulties that mismatch with the evaluated LLMs. These limitations lead to superficial assessments of LLM knowledge, thereby impeding the targeted model optimizations. To bridge this gap, we propose JudgeAgent, a knowledge-driven and dynamic evaluation framework for LLMs. To address the challenge of limited knowledge coverage, JudgeAgent leverages LLM agents equipped with context graphs to traverse knowledge structures systematically for question generation. Furthermore, to mitigate data contamination and difficulty mismatch, it adopts a difficulty-adaptive and multi-turn interview mechanism. Thereby, JudgeAgent can achieve comprehensive evaluations and facilitate more effective improvement of LLMs. Empirical results demonstrate that JudgeAgent enables more comprehensive evaluations and facilitates effective model iterations, highlighting the potential of this knowledge-driven and dynamic evaluation paradigm. The source code is available on https://github.com/DataArcTech/JudgeAgent.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shi-etal-2026-judgeagent">
<titleInfo>
<title>JudgeAgent: Beyond Static Benchmarks for Knowledge-Driven and Dynamic LLM Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhichao</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuhui</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengjin</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cangli</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shengjie</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yinghan</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zixuan</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jian</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuanzhuo</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Current evaluation methods for large language models (LLMs) primarily rely on static benchmarks, presenting two major challenges: limited knowledge coverage and fixed difficulties that mismatch with the evaluated LLMs. These limitations lead to superficial assessments of LLM knowledge, thereby impeding the targeted model optimizations. To bridge this gap, we propose JudgeAgent, a knowledge-driven and dynamic evaluation framework for LLMs. To address the challenge of limited knowledge coverage, JudgeAgent leverages LLM agents equipped with context graphs to traverse knowledge structures systematically for question generation. Furthermore, to mitigate data contamination and difficulty mismatch, it adopts a difficulty-adaptive and multi-turn interview mechanism. Thereby, JudgeAgent can achieve comprehensive evaluations and facilitate more effective improvement of LLMs. Empirical results demonstrate that JudgeAgent enables more comprehensive evaluations and facilitates effective model iterations, highlighting the potential of this knowledge-driven and dynamic evaluation paradigm. The source code is available on https://github.com/DataArcTech/JudgeAgent.</abstract>
<identifier type="citekey">shi-etal-2026-judgeagent</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.634/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>13004</start>
<end>13030</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T JudgeAgent: Beyond Static Benchmarks for Knowledge-Driven and Dynamic LLM Evaluation
%A Shi, Zhichao
%A Jiang, Xuhui
%A Xu, Chengjin
%A Yao, Cangli
%A Ma, Shengjie
%A Shen, Yinghan
%A Li, Zixuan
%A Guo, Jian
%A Wang, Yuanzhuo
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F shi-etal-2026-judgeagent
%X Current evaluation methods for large language models (LLMs) primarily rely on static benchmarks, presenting two major challenges: limited knowledge coverage and fixed difficulties that mismatch with the evaluated LLMs. These limitations lead to superficial assessments of LLM knowledge, thereby impeding the targeted model optimizations. To bridge this gap, we propose JudgeAgent, a knowledge-driven and dynamic evaluation framework for LLMs. To address the challenge of limited knowledge coverage, JudgeAgent leverages LLM agents equipped with context graphs to traverse knowledge structures systematically for question generation. Furthermore, to mitigate data contamination and difficulty mismatch, it adopts a difficulty-adaptive and multi-turn interview mechanism. Thereby, JudgeAgent can achieve comprehensive evaluations and facilitate more effective improvement of LLMs. Empirical results demonstrate that JudgeAgent enables more comprehensive evaluations and facilitates effective model iterations, highlighting the potential of this knowledge-driven and dynamic evaluation paradigm. The source code is available on https://github.com/DataArcTech/JudgeAgent.
%U https://aclanthology.org/2026.findings-acl.634/
%P 13004-13030
Markdown (Informal)
[JudgeAgent: Beyond Static Benchmarks for Knowledge-Driven and Dynamic LLM Evaluation](https://aclanthology.org/2026.findings-acl.634/) (Shi et al., Findings 2026)
ACL
- Zhichao Shi, Xuhui Jiang, Chengjin Xu, Cangli Yao, Shengjie Ma, Yinghan Shen, Zixuan Li, Jian Guo, and Yuanzhuo Wang. 2026. JudgeAgent: Beyond Static Benchmarks for Knowledge-Driven and Dynamic LLM Evaluation. In Findings of the Association for Computational Linguistics: ACL 2026, pages 13004–13030, San Diego, California, United States. Association for Computational Linguistics.