@inproceedings{romano-etal-2025-evaluating,
title = "Evaluating the Role of Verifiers in Test-Time Scaling for Legal Reasoning Tasks",
author = "Romano, Davide and
Schwarz, Jonathan Richard and
Giofr{\`e}, Daniele",
editor = "Aletras, Nikolaos and
Chalkidis, Ilias and
Barrett, Leslie and
Goanț{\u{a}}, C{\u{a}}t{\u{a}}lina and
Preoțiuc-Pietro, Daniel and
Spanakis, Gerasimos",
booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.nllp-1.15/",
pages = "207--225",
ISBN = "979-8-89176-338-8",
abstract = "Test-time scaling (TTS) techniques can improve the performance of large language models (LLMs) at the expense of additional computation and latency. While TTS has proven effective in formal domains such as mathematics and programming (Snell et al., 2024; Chen et al., 2024), its value in argumentative domains such as law remains underexplored. We present an empirical study of verifier-based TTS methods for legal multiple-choice QA (MCQA) across five benchmarks. Using a family of 7 reward models, we evaluate both outcome-level (Best-of-$N$) and process-level (tree search) verification under realistic low-$N$ budgets. Our analysis systematically investigates how verifier utility is affected by key properties such as domain specialization, model size, and supervision type (process-supervised PRMs vs. outcome-only ORMs), even when applied across different roles."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="romano-etal-2025-evaluating">
<titleInfo>
<title>Evaluating the Role of Verifiers in Test-Time Scaling for Legal Reasoning Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Davide</namePart>
<namePart type="family">Romano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="given">Richard</namePart>
<namePart type="family">Schwarz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniele</namePart>
<namePart type="family">Giofrè</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Natural Legal Language Processing Workshop 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikolaos</namePart>
<namePart type="family">Aletras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ilias</namePart>
<namePart type="family">Chalkidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leslie</namePart>
<namePart type="family">Barrett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cătălina</namePart>
<namePart type="family">Goanță</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoțiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gerasimos</namePart>
<namePart type="family">Spanakis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-338-8</identifier>
</relatedItem>
<abstract>Test-time scaling (TTS) techniques can improve the performance of large language models (LLMs) at the expense of additional computation and latency. While TTS has proven effective in formal domains such as mathematics and programming (Snell et al., 2024; Chen et al., 2024), its value in argumentative domains such as law remains underexplored. We present an empirical study of verifier-based TTS methods for legal multiple-choice QA (MCQA) across five benchmarks. Using a family of 7 reward models, we evaluate both outcome-level (Best-of-N) and process-level (tree search) verification under realistic low-N budgets. Our analysis systematically investigates how verifier utility is affected by key properties such as domain specialization, model size, and supervision type (process-supervised PRMs vs. outcome-only ORMs), even when applied across different roles.</abstract>
<identifier type="citekey">romano-etal-2025-evaluating</identifier>
<location>
<url>https://aclanthology.org/2025.nllp-1.15/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>207</start>
<end>225</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating the Role of Verifiers in Test-Time Scaling for Legal Reasoning Tasks
%A Romano, Davide
%A Schwarz, Jonathan Richard
%A Giofrè, Daniele
%Y Aletras, Nikolaos
%Y Chalkidis, Ilias
%Y Barrett, Leslie
%Y Goanță, Cătălina
%Y Preoțiuc-Pietro, Daniel
%Y Spanakis, Gerasimos
%S Proceedings of the Natural Legal Language Processing Workshop 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-338-8
%F romano-etal-2025-evaluating
%X Test-time scaling (TTS) techniques can improve the performance of large language models (LLMs) at the expense of additional computation and latency. While TTS has proven effective in formal domains such as mathematics and programming (Snell et al., 2024; Chen et al., 2024), its value in argumentative domains such as law remains underexplored. We present an empirical study of verifier-based TTS methods for legal multiple-choice QA (MCQA) across five benchmarks. Using a family of 7 reward models, we evaluate both outcome-level (Best-of-N) and process-level (tree search) verification under realistic low-N budgets. Our analysis systematically investigates how verifier utility is affected by key properties such as domain specialization, model size, and supervision type (process-supervised PRMs vs. outcome-only ORMs), even when applied across different roles.
%U https://aclanthology.org/2025.nllp-1.15/
%P 207-225
Markdown (Informal)
[Evaluating the Role of Verifiers in Test-Time Scaling for Legal Reasoning Tasks](https://aclanthology.org/2025.nllp-1.15/) (Romano et al., NLLP 2025)
ACL