@inproceedings{graciotti-etal-2025-ke,
title = "{KE}-{MHISTO}: Towards a Multilingual Historical Knowledge Extraction Benchmark for Addressing the Long-Tail Problem",
author = "Graciotti, Arianna and
Piano, Leonardo and
Lazzari, Nicolas and
Daga, Enrico and
Tripodi, Rocco and
Presutti, Valentina and
Pompianu, Livio",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1042/",
doi = "10.18653/v1/2025.findings-acl.1042",
pages = "20316--20339",
ISBN = "979-8-89176-256-5",
abstract = "Large Language Models (LLMs) face significant challenges when queried about long-tail knowledge, i.e., information that is rarely encountered during their training process. These difficulties arise due to the inherent sparsity of such data. Furthermore, LLMs often lack the ability to verify or ground their responses in authoritative sources, which can lead to plausible yet inaccurate outputs when addressing infrequent subject matter. Our work aims to investigate these phenomena by introducing KE-MHISTO, a multilingual benchmark for Entity Linking and Question Answering in the domain of historical music knowledge, available in both Italian and English. We demonstrate that KE-MHISTO provides significantly broader coverage of long-tail knowledge compared to existing alternatives. Moreover, it poses substantial challenges for state-of-the-art models. Our experiments reveal that smaller, multilingual models can achieve performance comparable to significantly larger counterparts, highlighting the potential of efficient, language-aware approaches for long-tail knowledge extraction. KE-MHISTO is available at: https://github.com/polifonia-project/KE-MHISTO."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="graciotti-etal-2025-ke">
<titleInfo>
<title>KE-MHISTO: Towards a Multilingual Historical Knowledge Extraction Benchmark for Addressing the Long-Tail Problem</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arianna</namePart>
<namePart type="family">Graciotti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leonardo</namePart>
<namePart type="family">Piano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicolas</namePart>
<namePart type="family">Lazzari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Daga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rocco</namePart>
<namePart type="family">Tripodi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valentina</namePart>
<namePart type="family">Presutti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Livio</namePart>
<namePart type="family">Pompianu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) face significant challenges when queried about long-tail knowledge, i.e., information that is rarely encountered during their training process. These difficulties arise due to the inherent sparsity of such data. Furthermore, LLMs often lack the ability to verify or ground their responses in authoritative sources, which can lead to plausible yet inaccurate outputs when addressing infrequent subject matter. Our work aims to investigate these phenomena by introducing KE-MHISTO, a multilingual benchmark for Entity Linking and Question Answering in the domain of historical music knowledge, available in both Italian and English. We demonstrate that KE-MHISTO provides significantly broader coverage of long-tail knowledge compared to existing alternatives. Moreover, it poses substantial challenges for state-of-the-art models. Our experiments reveal that smaller, multilingual models can achieve performance comparable to significantly larger counterparts, highlighting the potential of efficient, language-aware approaches for long-tail knowledge extraction. KE-MHISTO is available at: https://github.com/polifonia-project/KE-MHISTO.</abstract>
<identifier type="citekey">graciotti-etal-2025-ke</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1042</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1042/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>20316</start>
<end>20339</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T KE-MHISTO: Towards a Multilingual Historical Knowledge Extraction Benchmark for Addressing the Long-Tail Problem
%A Graciotti, Arianna
%A Piano, Leonardo
%A Lazzari, Nicolas
%A Daga, Enrico
%A Tripodi, Rocco
%A Presutti, Valentina
%A Pompianu, Livio
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F graciotti-etal-2025-ke
%X Large Language Models (LLMs) face significant challenges when queried about long-tail knowledge, i.e., information that is rarely encountered during their training process. These difficulties arise due to the inherent sparsity of such data. Furthermore, LLMs often lack the ability to verify or ground their responses in authoritative sources, which can lead to plausible yet inaccurate outputs when addressing infrequent subject matter. Our work aims to investigate these phenomena by introducing KE-MHISTO, a multilingual benchmark for Entity Linking and Question Answering in the domain of historical music knowledge, available in both Italian and English. We demonstrate that KE-MHISTO provides significantly broader coverage of long-tail knowledge compared to existing alternatives. Moreover, it poses substantial challenges for state-of-the-art models. Our experiments reveal that smaller, multilingual models can achieve performance comparable to significantly larger counterparts, highlighting the potential of efficient, language-aware approaches for long-tail knowledge extraction. KE-MHISTO is available at: https://github.com/polifonia-project/KE-MHISTO.
%R 10.18653/v1/2025.findings-acl.1042
%U https://aclanthology.org/2025.findings-acl.1042/
%U https://doi.org/10.18653/v1/2025.findings-acl.1042
%P 20316-20339
Markdown (Informal)
[KE-MHISTO: Towards a Multilingual Historical Knowledge Extraction Benchmark for Addressing the Long-Tail Problem](https://aclanthology.org/2025.findings-acl.1042/) (Graciotti et al., Findings 2025)
ACL