@inproceedings{chlapanis-etal-2025-greekbarbench,
title = "{G}reek{B}ar{B}ench: A Challenging Benchmark for Free-Text Legal Reasoning and Citations",
author = "Chlapanis, Odysseas S. and
Galanis, Dimitris and
Aletras, Nikolaos and
Androutsopoulos, Ion",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1368/",
pages = "25099--25119",
ISBN = "979-8-89176-335-7",
abstract = "We introduce GreekBarBench, a benchmark that evaluates LLMs on legal questions across five different legal areas from the Greek Bar exams, requiring citations to statutory articles and case facts. To tackle the challenges of free-text evaluation, we propose a three-dimensional scoring system combined with an LLM-as-a-judge approach. We also develop a meta-evaluation benchmark to assess the correlation between LLM-judges and human expert evaluations, revealing that simple, span-based rubrics improve their alignment. Our extensive evaluation of 13 proprietary and open-weight LLMs shows that even though the top models exhibit impressive performance, they remain susceptible to critical errors, most notably a failure to identify the correct statutory articles."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chlapanis-etal-2025-greekbarbench">
<titleInfo>
<title>GreekBarBench: A Challenging Benchmark for Free-Text Legal Reasoning and Citations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Odysseas</namePart>
<namePart type="given">S</namePart>
<namePart type="family">Chlapanis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dimitris</namePart>
<namePart type="family">Galanis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolaos</namePart>
<namePart type="family">Aletras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ion</namePart>
<namePart type="family">Androutsopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>We introduce GreekBarBench, a benchmark that evaluates LLMs on legal questions across five different legal areas from the Greek Bar exams, requiring citations to statutory articles and case facts. To tackle the challenges of free-text evaluation, we propose a three-dimensional scoring system combined with an LLM-as-a-judge approach. We also develop a meta-evaluation benchmark to assess the correlation between LLM-judges and human expert evaluations, revealing that simple, span-based rubrics improve their alignment. Our extensive evaluation of 13 proprietary and open-weight LLMs shows that even though the top models exhibit impressive performance, they remain susceptible to critical errors, most notably a failure to identify the correct statutory articles.</abstract>
<identifier type="citekey">chlapanis-etal-2025-greekbarbench</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1368/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>25099</start>
<end>25119</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GreekBarBench: A Challenging Benchmark for Free-Text Legal Reasoning and Citations
%A Chlapanis, Odysseas S.
%A Galanis, Dimitris
%A Aletras, Nikolaos
%A Androutsopoulos, Ion
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F chlapanis-etal-2025-greekbarbench
%X We introduce GreekBarBench, a benchmark that evaluates LLMs on legal questions across five different legal areas from the Greek Bar exams, requiring citations to statutory articles and case facts. To tackle the challenges of free-text evaluation, we propose a three-dimensional scoring system combined with an LLM-as-a-judge approach. We also develop a meta-evaluation benchmark to assess the correlation between LLM-judges and human expert evaluations, revealing that simple, span-based rubrics improve their alignment. Our extensive evaluation of 13 proprietary and open-weight LLMs shows that even though the top models exhibit impressive performance, they remain susceptible to critical errors, most notably a failure to identify the correct statutory articles.
%U https://aclanthology.org/2025.findings-emnlp.1368/
%P 25099-25119
Markdown (Informal)
[GreekBarBench: A Challenging Benchmark for Free-Text Legal Reasoning and Citations](https://aclanthology.org/2025.findings-emnlp.1368/) (Chlapanis et al., Findings 2025)
ACL