@inproceedings{ahmadi-etal-2026-lemur,
title = "{LEMUR}: Robust Fine-Tuning for Multilingual Embedding Models for Retrieval",
author = "Ahmadi, Narges Baba and
Strich, Jan and
Semmann, Martin and
Biemann, Chris",
editor = "Baez Santamaria, Selene and
Somayajula, Sai Ashish and
Yamaguchi, Atsuki",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 4: Student Research Workshop)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-srw.18/",
pages = "248--265",
ISBN = "979-8-89176-383-8",
abstract = "Large language models (LLMs) are increasingly used to access legal information. Yet, their deployment in multilingual legal settings is constrained by unreliable retrieval and the lack of domain-adapted, open-embedding models. In particular, existing multilingual legal corpora are not designed for semantic retrieval, and PDF-based legislative sources introduce substantial noise due to imperfect text extraction. To address these challenges, we introduce LEMUR, a large-scale multilingual corpus of EU environmental legislation constructed from 24,953 official EUR-Lex PDF documents covering 25 languages. We further propose the Lexical Content Score (LCS), a language-agnostic metric that quantifies the fidelity of PDF-to-text conversion by measuring lexical consistency against authoritative HTML versions. Building on LEMUR, we fine-tune three state-of-the-art multilingual embedding models using contrastive objectives in both monolingual and bilingual settings, reflecting realistic legal-retrieval scenarios. Experiments across low- and high-resource languages demonstrate that legal-domain fine-tuning consistently improves Top-k retrieval accuracy relative to strong baselines, with particularly pronounced gains for low-resource languages. Cross-lingual evaluations show that these improvements transfer to unseen languages, indicating that fine-tuning primarily enhances language-independent, content-level legal representations rather than language-specific cues. We publish code[GitHub Repository] and data[Hugging Face Dataset]."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ahmadi-etal-2026-lemur">
<titleInfo>
<title>LEMUR: Robust Fine-Tuning for Multilingual Embedding Models for Retrieval</title>
</titleInfo>
<name type="personal">
<namePart type="given">Narges</namePart>
<namePart type="given">Baba</namePart>
<namePart type="family">Ahmadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Strich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Martin</namePart>
<namePart type="family">Semmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Biemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Selene</namePart>
<namePart type="family">Baez Santamaria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sai</namePart>
<namePart type="given">Ashish</namePart>
<namePart type="family">Somayajula</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atsuki</namePart>
<namePart type="family">Yamaguchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-383-8</identifier>
</relatedItem>
<abstract>Large language models (LLMs) are increasingly used to access legal information. Yet, their deployment in multilingual legal settings is constrained by unreliable retrieval and the lack of domain-adapted, open-embedding models. In particular, existing multilingual legal corpora are not designed for semantic retrieval, and PDF-based legislative sources introduce substantial noise due to imperfect text extraction. To address these challenges, we introduce LEMUR, a large-scale multilingual corpus of EU environmental legislation constructed from 24,953 official EUR-Lex PDF documents covering 25 languages. We further propose the Lexical Content Score (LCS), a language-agnostic metric that quantifies the fidelity of PDF-to-text conversion by measuring lexical consistency against authoritative HTML versions. Building on LEMUR, we fine-tune three state-of-the-art multilingual embedding models using contrastive objectives in both monolingual and bilingual settings, reflecting realistic legal-retrieval scenarios. Experiments across low- and high-resource languages demonstrate that legal-domain fine-tuning consistently improves Top-k retrieval accuracy relative to strong baselines, with particularly pronounced gains for low-resource languages. Cross-lingual evaluations show that these improvements transfer to unseen languages, indicating that fine-tuning primarily enhances language-independent, content-level legal representations rather than language-specific cues. We publish code[GitHub Repository] and data[Hugging Face Dataset].</abstract>
<identifier type="citekey">ahmadi-etal-2026-lemur</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-srw.18/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>248</start>
<end>265</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LEMUR: Robust Fine-Tuning for Multilingual Embedding Models for Retrieval
%A Ahmadi, Narges Baba
%A Strich, Jan
%A Semmann, Martin
%A Biemann, Chris
%Y Baez Santamaria, Selene
%Y Somayajula, Sai Ashish
%Y Yamaguchi, Atsuki
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-383-8
%F ahmadi-etal-2026-lemur
%X Large language models (LLMs) are increasingly used to access legal information. Yet, their deployment in multilingual legal settings is constrained by unreliable retrieval and the lack of domain-adapted, open-embedding models. In particular, existing multilingual legal corpora are not designed for semantic retrieval, and PDF-based legislative sources introduce substantial noise due to imperfect text extraction. To address these challenges, we introduce LEMUR, a large-scale multilingual corpus of EU environmental legislation constructed from 24,953 official EUR-Lex PDF documents covering 25 languages. We further propose the Lexical Content Score (LCS), a language-agnostic metric that quantifies the fidelity of PDF-to-text conversion by measuring lexical consistency against authoritative HTML versions. Building on LEMUR, we fine-tune three state-of-the-art multilingual embedding models using contrastive objectives in both monolingual and bilingual settings, reflecting realistic legal-retrieval scenarios. Experiments across low- and high-resource languages demonstrate that legal-domain fine-tuning consistently improves Top-k retrieval accuracy relative to strong baselines, with particularly pronounced gains for low-resource languages. Cross-lingual evaluations show that these improvements transfer to unseen languages, indicating that fine-tuning primarily enhances language-independent, content-level legal representations rather than language-specific cues. We publish code[GitHub Repository] and data[Hugging Face Dataset].
%U https://aclanthology.org/2026.eacl-srw.18/
%P 248-265
Markdown (Informal)
[LEMUR: Robust Fine-Tuning for Multilingual Embedding Models for Retrieval](https://aclanthology.org/2026.eacl-srw.18/) (Ahmadi et al., EACL 2026)
ACL
- Narges Baba Ahmadi, Jan Strich, Martin Semmann, and Chris Biemann. 2026. LEMUR: Robust Fine-Tuning for Multilingual Embedding Models for Retrieval. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop), pages 248–265, Rabat, Morocco. Association for Computational Linguistics.