% Conference paper from the RANLP 2025 proceedings (ACL Anthology id 2025.ranlp-1.34).
% Values are brace-delimited; the accented author name uses the BibTeX
% special-character form so it sorts and labels correctly.
@inproceedings{denisova-rychly-2025-evaluating,
    title     = {Evaluating Bilingual Lexicon Induction without Lexical Data},
    author    = {Denisov{\'a}, Michaela and Rychly, Pavel},
    editor    = {Angelova, Galia and Kunilovskaya, Maria and Escribe, Marie and Mitkov, Ruslan},
    booktitle = {Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era},
    month     = sep,
    year      = {2025},
    address   = {Varna, Bulgaria},
    publisher = {INCOMA Ltd., Shoumen, Bulgaria},
    url       = {https://aclanthology.org/2025.ranlp-1.34/},
    pages     = {275--282},
    abstract  = {Bilingual Lexicon Induction (BLI) is a fundamental task in cross-lingual word embedding (CWE) evaluation, aimed at retrieving word translations from monolingual corpora in two languages. Despite the task{'}s central role, existing evaluation datasets based on lexical data often contain biases such as a lack of morphological diversity, frequency skew, semantic leakage, and overrepresentation of proper names, which undermine the validity of reported performance. In this paper, we propose a novel, language-agnostic evaluation methodology that entirely eliminates the dependency on lexical data. By training two sets of monolingual word embeddings (MWEs) using identical data and algorithms but with different weight initialisations, we enable the assessment on the BLI task without being affected by the quality of the evaluation dataset. We evaluate three baseline CWE models and analyse the impact of key hyperparameters. Our results provide a more reliable and bias-free perspective on CWE models' performance.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- Single MODS record for the paper; its ID equals the citekey identifier given further down. -->
<mods ID="denisova-rychly-2025-evaluating">
<titleInfo>
<title>Evaluating Bilingual Lexicon Induction without Lexical Data</title>
</titleInfo>
<!-- Paper authors, one name element each. -->
<name type="personal">
<namePart type="given">Michaela</namePart>
<namePart type="family">Denisová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pavel</namePart>
<namePart type="family">Rychly</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper, given as year and month. -->
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume containing the paper, with its editors, publisher and place. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Bilingual Lexicon Induction (BLI) is a fundamental task in cross-lingual word embedding (CWE) evaluation, aimed at retrieving word translations from monolingual corpora in two languages. Despite the task’s central role, existing evaluation datasets based on lexical data often contain biases such as a lack of morphological diversity, frequency skew, semantic leakage, and overrepresentation of proper names, which undermine the validity of reported performance. In this paper, we propose a novel, language-agnostic evaluation methodology that entirely eliminates the dependency on lexical data. By training two sets of monolingual word embeddings (MWEs) using identical data and algorithms but with different weight initialisations, we enable the assessment on the BLI task without being affected by the quality of the evaluation dataset. We evaluate three baseline CWE models and analyse the impact of key hyperparameters. Our results provide a more reliable and bias-free perspective on CWE models’ performance.</abstract>
<identifier type="citekey">denisova-rychly-2025-evaluating</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.34/</url>
</location>
<!-- Date and page range of the paper within the host volume. -->
<part>
<date>2025-09</date>
<extent unit="page">
<start>275</start>
<end>282</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Bilingual Lexicon Induction without Lexical Data
%A Denisová, Michaela
%A Rychly, Pavel
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F denisova-rychly-2025-evaluating
%X Bilingual Lexicon Induction (BLI) is a fundamental task in cross-lingual word embedding (CWE) evaluation, aimed at retrieving word translations from monolingual corpora in two languages. Despite the task’s central role, existing evaluation datasets based on lexical data often contain biases such as a lack of morphological diversity, frequency skew, semantic leakage, and overrepresentation of proper names, which undermine the validity of reported performance. In this paper, we propose a novel, language-agnostic evaluation methodology that entirely eliminates the dependency on lexical data. By training two sets of monolingual word embeddings (MWEs) using identical data and algorithms but with different weight initialisations, we enable the assessment on the BLI task without being affected by the quality of the evaluation dataset. We evaluate three baseline CWE models and analyse the impact of key hyperparameters. Our results provide a more reliable and bias-free perspective on CWE models’ performance.
%U https://aclanthology.org/2025.ranlp-1.34/
%P 275-282
Markdown (Informal)
[Evaluating Bilingual Lexicon Induction without Lexical Data](https://aclanthology.org/2025.ranlp-1.34/) (Denisová & Rychly, RANLP 2025)
ACL
- Michaela Denisová and Pavel Rychly. 2025. Evaluating Bilingual Lexicon Induction without Lexical Data. In Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era, pages 275–282, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.