@inproceedings{wei-etal-2025-mlake,
title = "{ML}a{KE}: Multilingual Knowledge Editing Benchmark for Large Language Models",
author = "Wei, Zihao and
Deng, Jingcheng and
Pang, Liang and
Ding, Hanxing and
Shen, Huawei and
Cheng, Xueqi",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.301/",
pages = "4457--4473",
abstract = "The extensive utilization of large language models (LLMs) underscores the crucial necessity for precise and contemporary knowledge embedded within their intrinsic parameters. Existing research on knowledge editing primarily concentrates on monolingual scenarios, neglecting the complexities presented by multilingual contexts and multi-hop reasoning. To address these challenges, our study introduces MLaKE (Multilingual Language Knowledge Editing), a novel benchmark comprising 4072 multi-hop and 5360 single-hop questions designed to evaluate the adaptability of knowledge editing methods across five languages: English, Chinese, Japanese, French, and German. MLaKE aggregates fact chains from Wikipedia across languages and utilizes LLMs to generate questions and answer. We assessed the effectiveness of current multilingual knowledge editing methods using the MLaKE dataset. Our results show that due to considerable inconsistencies in both multilingual performance and encoding efficiency, these methods struggle to generalize effectively across languages. The accuracy of these methods when editing English is notably higher than for other languages. The experimental results further demonstrate that models encode knowledge and generation capabilities for different languages using distinct parameters, leading to poor cross-lingual transfer performance in current methods. Transfer performance is notably better within the same language family compared to across different families. These findings emphasize the urgent need to improve multilingual knowledge editing methods."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wei-etal-2025-mlake">
<titleInfo>
<title>MLaKE: Multilingual Knowledge Editing Benchmark for Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zihao</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingcheng</namePart>
<namePart type="family">Deng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liang</namePart>
<namePart type="family">Pang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanxing</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huawei</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xueqi</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The extensive utilization of large language models (LLMs) underscores the crucial necessity for precise and contemporary knowledge embedded within their intrinsic parameters. Existing research on knowledge editing primarily concentrates on monolingual scenarios, neglecting the complexities presented by multilingual contexts and multi-hop reasoning. To address these challenges, our study introduces MLaKE (Multilingual Language Knowledge Editing), a novel benchmark comprising 4072 multi-hop and 5360 single-hop questions designed to evaluate the adaptability of knowledge editing methods across five languages: English, Chinese, Japanese, French, and German. MLaKE aggregates fact chains from Wikipedia across languages and utilizes LLMs to generate questions and answers. We assessed the effectiveness of current multilingual knowledge editing methods using the MLaKE dataset. Our results show that due to considerable inconsistencies in both multilingual performance and encoding efficiency, these methods struggle to generalize effectively across languages. The accuracy of these methods when editing English is notably higher than for other languages. The experimental results further demonstrate that models encode knowledge and generation capabilities for different languages using distinct parameters, leading to poor cross-lingual transfer performance in current methods. Transfer performance is notably better within the same language family compared to across different families. These findings emphasize the urgent need to improve multilingual knowledge editing methods.</abstract>
<identifier type="citekey">wei-etal-2025-mlake</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.301/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>4457</start>
<end>4473</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MLaKE: Multilingual Knowledge Editing Benchmark for Large Language Models
%A Wei, Zihao
%A Deng, Jingcheng
%A Pang, Liang
%A Ding, Hanxing
%A Shen, Huawei
%A Cheng, Xueqi
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F wei-etal-2025-mlake
%X The extensive utilization of large language models (LLMs) underscores the crucial necessity for precise and contemporary knowledge embedded within their intrinsic parameters. Existing research on knowledge editing primarily concentrates on monolingual scenarios, neglecting the complexities presented by multilingual contexts and multi-hop reasoning. To address these challenges, our study introduces MLaKE (Multilingual Language Knowledge Editing), a novel benchmark comprising 4072 multi-hop and 5360 single-hop questions designed to evaluate the adaptability of knowledge editing methods across five languages: English, Chinese, Japanese, French, and German. MLaKE aggregates fact chains from Wikipedia across languages and utilizes LLMs to generate questions and answers. We assessed the effectiveness of current multilingual knowledge editing methods using the MLaKE dataset. Our results show that due to considerable inconsistencies in both multilingual performance and encoding efficiency, these methods struggle to generalize effectively across languages. The accuracy of these methods when editing English is notably higher than for other languages. The experimental results further demonstrate that models encode knowledge and generation capabilities for different languages using distinct parameters, leading to poor cross-lingual transfer performance in current methods. Transfer performance is notably better within the same language family compared to across different families. These findings emphasize the urgent need to improve multilingual knowledge editing methods.
%U https://aclanthology.org/2025.coling-main.301/
%P 4457-4473
Markdown (Informal)
[MLaKE: Multilingual Knowledge Editing Benchmark for Large Language Models](https://aclanthology.org/2025.coling-main.301/) (Wei et al., COLING 2025)
ACL
Zihao Wei, Jingcheng Deng, Liang Pang, Hanxing Ding, Huawei Shen, and Xueqi Cheng. 2025. [MLaKE: Multilingual Knowledge Editing Benchmark for Large Language Models](https://aclanthology.org/2025.coling-main.301/). In Proceedings of the 31st International Conference on Computational Linguistics, pages 4457–4473, Abu Dhabi, UAE. Association for Computational Linguistics.