BibTeX
@inproceedings{purushothama-etal-2025-ready,
    title = "Not ready for the bench: {LLM} legal interpretation is unstable and uncalibrated to human judgments",
    author = "Purushothama, Abhishek and
      Min, Junghyun and
      Waldon, Brandon and
      Schneider, Nathan",
    editor = "Aletras, Nikolaos and
      Chalkidis, Ilias and
      Barrett, Leslie and
      Goanț{\u{a}}, C{\u{a}}t{\u{a}}lina and
      Preoțiuc-Pietro, Daniel and
      Spanakis, Gerasimos",
    booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2025",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.nllp-1.22/",
    pages = "317--317",
    ISBN = "979-8-89176-338-8",
    abstract = "Legal interpretation frequently involves assessing how a legal text, as understood by an `ordinary' speaker of the language, applies to the set of facts characterizing a legal dispute. Recent scholarship has proposed that legal practitioners add large language models (LLMs) to their interpretive toolkit. This work offers an empirical argument against LLM-assisted interpretation as recently practiced by legal scholars and federal judges. Our investigation in English shows that models do not provide stable interpretive judgments and are susceptible to subtle variations in the prompt. While instruction tuning slightly improves model calibration to human judgments, even the best-calibrated LLMs remain weak predictors of human native speakers' judgments."
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="purushothama-etal-2025-ready">
    <titleInfo>
      <title>Not ready for the bench: LLM legal interpretation is unstable and uncalibrated to human judgments</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Abhishek</namePart>
      <namePart type="family">Purushothama</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Junghyun</namePart>
      <namePart type="family">Min</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Brandon</namePart>
      <namePart type="family">Waldon</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nathan</namePart>
      <namePart type="family">Schneider</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Natural Legal Language Processing Workshop 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nikolaos</namePart>
        <namePart type="family">Aletras</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ilias</namePart>
        <namePart type="family">Chalkidis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leslie</namePart>
        <namePart type="family">Barrett</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Cătălina</namePart>
        <namePart type="family">Goanță</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">Preoțiuc-Pietro</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Gerasimos</namePart>
        <namePart type="family">Spanakis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-338-8</identifier>
    </relatedItem>
    <abstract>Legal interpretation frequently involves assessing how a legal text, as understood by an ‘ordinary’ speaker of the language, applies to the set of facts characterizing a legal dispute. Recent scholarship has proposed that legal practitioners add large language models (LLMs) to their interpretive toolkit. This work offers an empirical argument against LLM-assisted interpretation as recently practiced by legal scholars and federal judges. Our investigation in English shows that models do not provide stable interpretive judgments and are susceptible to subtle variations in the prompt. While instruction tuning slightly improves model calibration to human judgments, even the best-calibrated LLMs remain weak predictors of human native speakers’ judgments.</abstract>
    <identifier type="citekey">purushothama-etal-2025-ready</identifier>
    <location>
      <url>https://aclanthology.org/2025.nllp-1.22/</url>
    </location>
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>317</start>
        <end>317</end>
      </extent>
    </part>
  </mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T Not ready for the bench: LLM legal interpretation is unstable and uncalibrated to human judgments
%A Purushothama, Abhishek
%A Min, Junghyun
%A Waldon, Brandon
%A Schneider, Nathan
%Y Aletras, Nikolaos
%Y Chalkidis, Ilias
%Y Barrett, Leslie
%Y Goanță, Cătălina
%Y Preoțiuc-Pietro, Daniel
%Y Spanakis, Gerasimos
%S Proceedings of the Natural Legal Language Processing Workshop 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-338-8
%F purushothama-etal-2025-ready
%X Legal interpretation frequently involves assessing how a legal text, as understood by an ‘ordinary’ speaker of the language, applies to the set of facts characterizing a legal dispute. Recent scholarship has proposed that legal practitioners add large language models (LLMs) to their interpretive toolkit. This work offers an empirical argument against LLM-assisted interpretation as recently practiced by legal scholars and federal judges. Our investigation in English shows that models do not provide stable interpretive judgments and are susceptible to subtle variations in the prompt. While instruction tuning slightly improves model calibration to human judgments, even the best-calibrated LLMs remain weak predictors of human native speakers’ judgments.
%U https://aclanthology.org/2025.nllp-1.22/
%P 317-317

Markdown (Informal)
[Not ready for the bench: LLM legal interpretation is unstable and uncalibrated to human judgments](https://aclanthology.org/2025.nllp-1.22/) (Purushothama et al., NLLP 2025)
ACL
Abhishek Purushothama, Junghyun Min, Brandon Waldon, and Nathan Schneider. 2025. Not ready for the bench: LLM legal interpretation is unstable and uncalibrated to human judgments. In Proceedings of the Natural Legal Language Processing Workshop 2025, pages 317–317, Suzhou, China. Association for Computational Linguistics.