BibTeX
@inproceedings{chu-etal-2025-unpacking,
title = "Unpacking Legal Reasoning in {LLM}s: Chain-of-Thought as a Key to Human-Machine Alignment in Essay-Based {NLU} Tasks",
author = "Chu, Yu Ying and
Huang, Sieh-chuen and
Shao, Hsuan-Lei",
editor = "Abzianidze, Lasha and
de Paiva, Valeria",
booktitle = "Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning (NALOMA)",
month = aug,
year = "2025",
address = "Bochum, Germany",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naloma-1.1/",
pages = "1--7",
ISBN = "979-8-89176-287-9",
abstract = "This study evaluates how Large Language Models (LLMs) perform deep legal reasoning on Taiwanese Status Law questions and investigates how Chain-of-Thought (CoT) prompting affects interpretability, alignment, and generalization. Using a two-stage evaluation framework, we first decomposed six real legal essay questions into 68 sub-questions covering issue spotting, statutory application, and inheritance computation. In Stage Two, full-length answers were collected under baseline and CoT-prompted conditions. Four LLMs{---}ChatGPT-4o, Gemini, Grok3, and Copilot{---}were tested. Results show CoT prompting significantly improved accuracy for Gemini (from 83.2{\%} to 94.5{\%}, p {\ensuremath{<}} 0.05) and Grok3, with moderate but consistent gains for ChatGPT and Copilot. Human evaluation of full-length responses revealed CoT answers received notably higher scores in issue coverage and reasoning clarity, with ChatGPT and Gemini gaining +2.67 and +1.92 points respectively. Despite these gains, legal misclassifications persist, highlighting alignment gaps between surface-level fluency and expert legal reasoning. This work opens the black box of legal NLU by tracing LLM reasoning chains, quantifying performance shifts under structured prompting, and providing a diagnostic benchmark for complex, open-ended legal tasks beyond multiple-choice settings."
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chu-etal-2025-unpacking">
<titleInfo>
<title>Unpacking Legal Reasoning in LLMs: Chain-of-Thought as a Key to Human-Machine Alignment in Essay-Based NLU Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="given">Ying</namePart>
<namePart type="family">Chu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sieh-chuen</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hsuan-Lei</namePart>
<namePart type="family">Shao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning (NALOMA)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lasha</namePart>
<namePart type="family">Abzianidze</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valeria</namePart>
<namePart type="family">de Paiva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bochum, Germany</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-287-9</identifier>
</relatedItem>
<abstract>This study evaluates how Large Language Models (LLMs) perform deep legal reasoning on Taiwanese Status Law questions and investigates how Chain-of-Thought (CoT) prompting affects interpretability, alignment, and generalization. Using a two-stage evaluation framework, we first decomposed six real legal essay questions into 68 sub-questions covering issue spotting, statutory application, and inheritance computation. In Stage Two, full-length answers were collected under baseline and CoT-prompted conditions. Four LLMs—ChatGPT-4o, Gemini, Grok3, and Copilot—were tested. Results show CoT prompting significantly improved accuracy for Gemini (from 83.2% to 94.5%, p &lt; 0.05) and Grok3, with moderate but consistent gains for ChatGPT and Copilot. Human evaluation of full-length responses revealed CoT answers received notably higher scores in issue coverage and reasoning clarity, with ChatGPT and Gemini gaining +2.67 and +1.92 points respectively. Despite these gains, legal misclassifications persist, highlighting alignment gaps between surface-level fluency and expert legal reasoning. This work opens the black box of legal NLU by tracing LLM reasoning chains, quantifying performance shifts under structured prompting, and providing a diagnostic benchmark for complex, open-ended legal tasks beyond multiple-choice settings.</abstract>
<identifier type="citekey">chu-etal-2025-unpacking</identifier>
<location>
<url>https://aclanthology.org/2025.naloma-1.1/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>1</start>
<end>7</end>
</extent>
</part>
</mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T Unpacking Legal Reasoning in LLMs: Chain-of-Thought as a Key to Human-Machine Alignment in Essay-Based NLU Tasks
%A Chu, Yu Ying
%A Huang, Sieh-chuen
%A Shao, Hsuan-Lei
%Y Abzianidze, Lasha
%Y de Paiva, Valeria
%S Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning (NALOMA)
%D 2025
%8 August
%I Association for Computational Linguistics
%C Bochum, Germany
%@ 979-8-89176-287-9
%F chu-etal-2025-unpacking
%X This study evaluates how Large Language Models (LLMs) perform deep legal reasoning on Taiwanese Status Law questions and investigates how Chain-of-Thought (CoT) prompting affects interpretability, alignment, and generalization. Using a two-stage evaluation framework, we first decomposed six real legal essay questions into 68 sub-questions covering issue spotting, statutory application, and inheritance computation. In Stage Two, full-length answers were collected under baseline and CoT-prompted conditions. Four LLMs—ChatGPT-4o, Gemini, Grok3, and Copilot—were tested. Results show CoT prompting significantly improved accuracy for Gemini (from 83.2% to 94.5%, p < 0.05) and Grok3, with moderate but consistent gains for ChatGPT and Copilot. Human evaluation of full-length responses revealed CoT answers received notably higher scores in issue coverage and reasoning clarity, with ChatGPT and Gemini gaining +2.67 and +1.92 points respectively. Despite these gains, legal misclassifications persist, highlighting alignment gaps between surface-level fluency and expert legal reasoning. This work opens the black box of legal NLU by tracing LLM reasoning chains, quantifying performance shifts under structured prompting, and providing a diagnostic benchmark for complex, open-ended legal tasks beyond multiple-choice settings.
%U https://aclanthology.org/2025.naloma-1.1/
%P 1-7
Markdown (Informal)
[Unpacking Legal Reasoning in LLMs: Chain-of-Thought as a Key to Human-Machine Alignment in Essay-Based NLU Tasks](https://aclanthology.org/2025.naloma-1.1/) (Chu et al., NALOMA 2025)
ACL
Yu Ying Chu, Sieh-chuen Huang, and Hsuan-Lei Shao. 2025. Unpacking Legal Reasoning in LLMs: Chain-of-Thought as a Key to Human-Machine Alignment in Essay-Based NLU Tasks. In Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning (NALOMA), pages 1–7, Bochum, Germany. Association for Computational Linguistics.