% Conference-paper record for the JurisBench paper, as exported from
% aclanthology.org (see the url field). Capitalisation that must survive
% sentence-casing styles is protected with whole-word braces.
% NOTE(review): the author entered as "Tianjiao, Li" may have family and given
% names swapped (every other author is listed family-name first) -- confirm
% against the paper before changing; the value is kept exactly as exported.
@inproceedings{chen-etal-2026-jurisbench,
    title = "{JurisBench}: A Deep Benchmark for Assessing Large Language Models in Professional Legal Practice",
    author = "Chen, Ziang and
      Li, Guannan and
      Ji, Fanlin and
      Kang, Yipeng and
      Li, Jiaqi and
      Zhang, Muhan and
      Zhang, Yangtao and
      Tianjiao, Li and
      Wang, Jiannan and
      Guo, Xin and
      Zhu, Song-Chun and
      Ling, Bin",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.acl-long.1666/",
    pages = "35994--36018",
    isbn = "979-8-89176-390-6",
    abstract = "Large Language Models (LLMs) have demonstrated strong cross-domain capabilities, yet their competence in specialized professional tasks remains underexamined. Existing legal benchmarks evaluate isolated tasks or exam-style questions, failing to capture the procedural interdependencies and adjudicative rigor inherent in professional practice. To bridge this gap, we construct JurisBench, a vertical, depth-oriented, domain-specific benchmark designed to evaluate LLMs across key stages of Chinese civil litigation. JurisBench introduces a Linear Depth Simulation track that mirrors the cognitive workflow of professional judges through four sequential, dependency-aware phases: Cause of Action prediction, Focus of Disputes identification, Rationale of the Judgment generation, and Result of the Judgment determination. Results reveal an ``illusion of competence'': state-of-the-art models exhibit marked performance degradation in end-to-end pipelines due to cascading error propagation. We identify precise statutory grounding as a persistent bottleneck, highlighting a critical gap between fluent linguistic output and judicial reliability. JurisBench shifts evaluation from isolated legal knowledge to workflow-level task execution, providing a diagnostic framework for legal AI and a template for benchmark design in specialized domains."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2026-jurisbench">
<titleInfo>
<title>JurisBench: A Deep Benchmark for Assessing Large Language Models in Professional Legal Practice</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ziang</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guannan</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fanlin</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yipeng</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaqi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yangtao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Li</namePart>
<namePart type="family">Tianjiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiannan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xin</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Song-Chun</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">Ling</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) have demonstrated strong cross-domain capabilities, yet their competence in specialized professional tasks remains underexamined. Existing legal benchmarks evaluate isolated tasks or exam-style questions, failing to capture the procedural interdependencies and adjudicative rigor inherent in professional practice. To bridge this gap, we construct JurisBench, a vertical, depth-oriented, domain-specific benchmark designed to evaluate LLMs across key stages of Chinese civil litigation. JurisBench introduces a Linear Depth Simulation track that mirrors the cognitive workflow of professional judges through four sequential, dependency-aware phases: Cause of Action prediction, Focus of Disputes identification, Rationale of the Judgment generation, and Result of the Judgment determination. Results reveal an “illusion of competence”: state-of-the-art models exhibit marked performance degradation in end-to-end pipelines due to cascading error propagation. We identify precise statutory grounding as a persistent bottleneck, highlighting a critical gap between fluent linguistic output and judicial reliability. JurisBench shifts evaluation from isolated legal knowledge to workflow-level task execution, providing a diagnostic framework for legal AI and a template for benchmark design in specialized domains.</abstract>
<identifier type="citekey">chen-etal-2026-jurisbench</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.1666/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>35994</start>
<end>36018</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T JurisBench: A Deep Benchmark for Assessing Large Language Models in Professional Legal Practice
%A Chen, Ziang
%A Li, Guannan
%A Ji, Fanlin
%A Kang, Yipeng
%A Li, Jiaqi
%A Zhang, Muhan
%A Zhang, Yangtao
%A Tianjiao, Li
%A Wang, Jiannan
%A Guo, Xin
%A Zhu, Song-Chun
%A Ling, Bin
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F chen-etal-2026-jurisbench
%X Large Language Models (LLMs) have demonstrated strong cross-domain capabilities, yet their competence in specialized professional tasks remains underexamined. Existing legal benchmarks evaluate isolated tasks or exam-style questions, failing to capture the procedural interdependencies and adjudicative rigor inherent in professional practice. To bridge this gap, we construct JurisBench, a vertical, depth-oriented, domain-specific benchmark designed to evaluate LLMs across key stages of Chinese civil litigation. JurisBench introduces a Linear Depth Simulation track that mirrors the cognitive workflow of professional judges through four sequential, dependency-aware phases: Cause of Action prediction, Focus of Disputes identification, Rationale of the Judgment generation, and Result of the Judgment determination. Results reveal an “illusion of competence”: state-of-the-art models exhibit marked performance degradation in end-to-end pipelines due to cascading error propagation. We identify precise statutory grounding as a persistent bottleneck, highlighting a critical gap between fluent linguistic output and judicial reliability. JurisBench shifts evaluation from isolated legal knowledge to workflow-level task execution, providing a diagnostic framework for legal AI and a template for benchmark design in specialized domains.
%U https://aclanthology.org/2026.acl-long.1666/
%P 35994-36018
Markdown (Informal)
[JurisBench: A Deep Benchmark for Assessing Large Language Models in Professional Legal Practice](https://aclanthology.org/2026.acl-long.1666/) (Chen et al., ACL 2026)
ACL
- Ziang Chen, Guannan Li, Fanlin Ji, Yipeng Kang, Jiaqi Li, Muhan Zhang, Yangtao Zhang, Li Tianjiao, Jiannan Wang, Xin Guo, Song-Chun Zhu, and Bin Ling. 2026. JurisBench: A Deep Benchmark for Assessing Large Language Models in Professional Legal Practice. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 35994–36018, San Diego, California, United States. Association for Computational Linguistics.