@inproceedings{banar-etal-2026-blleqa,
title = "b{LL}e{QA}: Benchmarking {LLM}s for Grounded Legal Question-Answering in {F}rench and {D}utch",
author = "Banar, Nikolay and
Lotfi, Ehsan and
Van Nooten, Jens and
Kliocaite, Marija and
Daelemans, Walter",
editor = "Chen, Canyu and
Zhang, Yuji and
Li, Zoey Sha and
Wang, Zihan and
Wang, Qineng and
Su, Jinyan and
Kargupta, Priyanka and
Marjanovi{\'c}, Sara Vera and
Pan, Jeff Z. and
Bansal, Mohit and
Augenstein, Isabelle and
Han, Jiawei and
Ji, Heng and
Li, Manling",
booktitle = "Proceedings of the 4th Workshop on Towards Knowledgeable Foundation Models ({K}now{FM} 2026)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.knowfm-1.4/",
pages = "34--59",
ISBN = "979-8-89176-403-3",
abstract = "Retrieval-augmented generation (RAG) systems can play an important role in making law more accessible. However, large and reliable resources for training and benchmarking such systems remain scarce, especially for under-resourced languages like Dutch. To address this gap, and building on previous work (Louis et al., 2024), we introduce bLLeQA, a bilingual parallel question-answering dataset grounded in Belgian legal resources, both in French and Dutch. The dataset contains aligned questions, answers, and supporting articles in both languages, enabling evaluation of both retrieval and end-to-end RAG pipelines. Using bLLeQA, we benchmark the full RAG pipeline in a zero-shot setting, covering retrieval, citation extraction, refusal behavior, and generation quality. Our experiments show that open-weight models are competitive with proprietary models in retrieval and citation extraction, but lag behind in generation quality in the RAG pipeline. Across all models, refusal capability remains weak, meaning that models do not reliably detect when the provided supporting sources are incomplete. In addition, the end-to-end RAG setup still yields a substantial share of flawed responses, reaching 20{\%} even in the best-case scenario."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="banar-etal-2026-blleqa">
<titleInfo>
<title>bLLeQA: Benchmarking LLMs for Grounded Legal Question-Answering in French and Dutch</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikolay</namePart>
<namePart type="family">Banar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehsan</namePart>
<namePart type="family">Lotfi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jens</namePart>
<namePart type="family">Van Nooten</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marija</namePart>
<namePart type="family">Kliocaite</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Walter</namePart>
<namePart type="family">Daelemans</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Towards Knowledgeable Foundation Models (KnowFM 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Canyu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuji</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zoey</namePart>
<namePart type="given">Sha</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zihan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qineng</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinyan</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Priyanka</namePart>
<namePart type="family">Kargupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="given">Vera</namePart>
<namePart type="family">Marjanović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jeff</namePart>
<namePart type="given">Z</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isabelle</namePart>
<namePart type="family">Augenstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiawei</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manling</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-403-3</identifier>
</relatedItem>
<abstract>Retrieval-augmented generation (RAG) systems can play an important role in making law more accessible. However, large and reliable resources for training and benchmarking such systems remain scarce, especially for under-resourced languages like Dutch. To address this gap, and building on previous work (Louis et al., 2024), we introduce bLLeQA, a bilingual parallel question-answering dataset grounded in Belgian legal resources, both in French and Dutch. The dataset contains aligned questions, answers, and supporting articles in both languages, enabling evaluation of both retrieval and end-to-end RAG pipelines. Using bLLeQA, we benchmark the full RAG pipeline in a zero-shot setting, covering retrieval, citation extraction, refusal behavior, and generation quality. Our experiments show that open-weight models are competitive with proprietary models in retrieval and citation extraction, but lag behind in generation quality in the RAG pipeline. Across all models, refusal capability remains weak, meaning that models do not reliably detect when the provided supporting sources are incomplete. In addition, the end-to-end RAG setup still yields a substantial share of flawed responses, reaching 20% even in the best-case scenario.</abstract>
<identifier type="citekey">banar-etal-2026-blleqa</identifier>
<location>
<url>https://aclanthology.org/2026.knowfm-1.4/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>34</start>
<end>59</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T bLLeQA: Benchmarking LLMs for Grounded Legal Question-Answering in French and Dutch
%A Banar, Nikolay
%A Lotfi, Ehsan
%A Van Nooten, Jens
%A Kliocaite, Marija
%A Daelemans, Walter
%Y Chen, Canyu
%Y Zhang, Yuji
%Y Li, Zoey Sha
%Y Wang, Zihan
%Y Wang, Qineng
%Y Su, Jinyan
%Y Kargupta, Priyanka
%Y Marjanović, Sara Vera
%Y Pan, Jeff Z.
%Y Bansal, Mohit
%Y Augenstein, Isabelle
%Y Han, Jiawei
%Y Ji, Heng
%Y Li, Manling
%S Proceedings of the 4th Workshop on Towards Knowledgeable Foundation Models (KnowFM 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-403-3
%F banar-etal-2026-blleqa
%X Retrieval-augmented generation (RAG) systems can play an important role in making law more accessible. However, large and reliable resources for training and benchmarking such systems remain scarce, especially for under-resourced languages like Dutch. To address this gap, and building on previous work (Louis et al., 2024), we introduce bLLeQA, a bilingual parallel question-answering dataset grounded in Belgian legal resources, both in French and Dutch. The dataset contains aligned questions, answers, and supporting articles in both languages, enabling evaluation of both retrieval and end-to-end RAG pipelines. Using bLLeQA, we benchmark the full RAG pipeline in a zero-shot setting, covering retrieval, citation extraction, refusal behavior, and generation quality. Our experiments show that open-weight models are competitive with proprietary models in retrieval and citation extraction, but lag behind in generation quality in the RAG pipeline. Across all models, refusal capability remains weak, meaning that models do not reliably detect when the provided supporting sources are incomplete. In addition, the end-to-end RAG setup still yields a substantial share of flawed responses, reaching 20% even in the best-case scenario.
%U https://aclanthology.org/2026.knowfm-1.4/
%P 34-59
Markdown (Informal)
[bLLeQA: Benchmarking LLMs for Grounded Legal Question-Answering in French and Dutch](https://aclanthology.org/2026.knowfm-1.4/) (Banar et al., KnowFM 2026)
ACL