@inproceedings{mandal-etal-2026-historybankqa,
title = "{H}istory{B}ank{QA}: Multilingual Temporal Question Answering on Historical Events",
author = "Mandal, Biswadip and
Khandelwal, Anant and
Gupta, Manish",
editor = "Mohammad, Saif M. and
Ousidhoum, Nedjma",
booktitle = "Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*{SEM} 2026)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.starsem-conference.33/",
pages = "474--496",
ISBN = "979-8-89176-413-2",
abstract = "Temporal reasoning over historical events is vital for temporal NLP tasks such as event extraction, entity linking, question answering (QA), timeline summarization, event clustering, and natural language inference. However, benchmarks for evaluating large language models (LLMs) on temporal reasoning remain limited. Existing datasets are small, lack multilingual coverage, and focus on recent events. To address this, we introduce HistoryBank, a multilingual database of 10M+ historical events sourced from Wikipedia timelines and infoboxes. Our database provides unprecedented coverage in both historical depth and linguistic breadth with 10 languages. We also present a comprehensive benchmark covering 6 temporal QA tasks across all languages, evaluating models like LLaMA-3-8B, Mistral-7B, Gemma-2-9B, Qwen3-8B, and GPT-4o. GPT-4o consistently performs best; Gemma-2 leads among smaller models. Our work offers a rich resource for advancing multilingual, temporally-aware language understanding of historical events. To support further research, we publicly release our code and datasets. Code available at https://github.com/mandalbiswadip/history-bank and data available at: https://drive.google.com/drive/folders/1vHudioDdI3EeYPbhYjKa0gimxaXvpxB2."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mandal-etal-2026-historybankqa">
<titleInfo>
<title>HistoryBankQA: Multilingual Temporal Question Answering on Historical Events</title>
</titleInfo>
<name type="personal">
<namePart type="given">Biswadip</namePart>
<namePart type="family">Mandal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anant</namePart>
<namePart type="family">Khandelwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manish</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*SEM 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saif</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Mohammad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nedjma</namePart>
<namePart type="family">Ousidhoum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-413-2</identifier>
</relatedItem>
<abstract>Temporal reasoning over historical events is vital for temporal NLP tasks such as event extraction, entity linking, question answering (QA), timeline summarization, event clustering, and natural language inference. However, benchmarks for evaluating large language models (LLMs) on temporal reasoning remain limited. Existing datasets are small, lack multilingual coverage, and focus on recent events. To address this, we introduce HistoryBank, a multilingual database of 10M+ historical events sourced from Wikipedia timelines and infoboxes. Our database provides unprecedented coverage in both historical depth and linguistic breadth with 10 languages. We also present a comprehensive benchmark covering 6 temporal QA tasks across all languages, evaluating models like LLaMA-3-8B, Mistral-7B, Gemma-2-9B, Qwen3-8B, and GPT-4o. GPT-4o consistently performs best; Gemma-2 leads among smaller models. Our work offers a rich resource for advancing multilingual, temporally-aware language understanding of historical events. To support further research, we publicly release our code and datasets. Code available at https://github.com/mandalbiswadip/history-bank and data available at: https://drive.google.com/drive/folders/1vHudioDdI3EeYPbhYjKa0gimxaXvpxB2.</abstract>
<identifier type="citekey">mandal-etal-2026-historybankqa</identifier>
<location>
<url>https://aclanthology.org/2026.starsem-conference.33/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>474</start>
<end>496</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HistoryBankQA: Multilingual Temporal Question Answering on Historical Events
%A Mandal, Biswadip
%A Khandelwal, Anant
%A Gupta, Manish
%Y Mohammad, Saif M.
%Y Ousidhoum, Nedjma
%S Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*SEM 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-413-2
%F mandal-etal-2026-historybankqa
%X Temporal reasoning over historical events is vital for temporal NLP tasks such as event extraction, entity linking, question answering (QA), timeline summarization, event clustering, and natural language inference. However, benchmarks for evaluating large language models (LLMs) on temporal reasoning remain limited. Existing datasets are small, lack multilingual coverage, and focus on recent events. To address this, we introduce HistoryBank, a multilingual database of 10M+ historical events sourced from Wikipedia timelines and infoboxes. Our database provides unprecedented coverage in both historical depth and linguistic breadth with 10 languages. We also present a comprehensive benchmark covering 6 temporal QA tasks across all languages, evaluating models like LLaMA-3-8B, Mistral-7B, Gemma-2-9B, Qwen3-8B, and GPT-4o. GPT-4o consistently performs best; Gemma-2 leads among smaller models. Our work offers a rich resource for advancing multilingual, temporally-aware language understanding of historical events. To support further research, we publicly release our code and datasets. Code available at https://github.com/mandalbiswadip/history-bank and data available at: https://drive.google.com/drive/folders/1vHudioDdI3EeYPbhYjKa0gimxaXvpxB2.
%U https://aclanthology.org/2026.starsem-conference.33/
%P 474-496
Markdown (Informal)
[HistoryBankQA: Multilingual Temporal Question Answering on Historical Events](https://aclanthology.org/2026.starsem-conference.33/) (Mandal et al., *SEM 2026)
ACL