@inproceedings{mazzia-etal-2026-benchmarking,
title = "Benchmarking Multilingual Temporal Reasoning in {LLM}s: The Temporal Reasoning Dataset",
author = "Mazzia, Vittorio and
Pollastrini, Sandro and
Bernardi, Davide and
Rubagotti, Chiara and
Amberti, Daniele",
editor = "Riccardi, Giuseppe and
Mousavi, Seyed Mahed and
Torres, Maria Ines and
Yoshino, Koichiro and
Callejas, Zoraida and
Chowdhury, Shammur Absar and
Chen, Yun-Nung and
Bechet, Frederic and
Gustafson, Joakim and
Damnati, G{\'e}raldine and
Papangelis, Alex and
D{'}Haro, Luis Fernando and
Mendon{\c{c}}a, John and
Bernardi, Raffaella and
Hakkani-Tur, Dilek and
Di Fabbrizio, Giuseppe {''}Pino{''} and
Kawahara, Tatsuya and
Alam, Firoj and
Tur, Gokhan and
Johnston, Michael",
booktitle = "Proceedings of the 16th International Workshop on Spoken Dialogue System Technology",
month = feb,
year = "2026",
address = "Trento, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.iwsds-1.19/",
pages = "168--181",
abstract = "Time reasoning is a make-or-break capability for Large Language Models ({LLM}s) aspiring to act as reliable personal and enterprise assistants. This work introduces the Temporal Reasoning Dataset ({TRD}), a programmatically generated multilingual benchmark designed to evaluate temporal reasoning operational capabilities in {LLM}s across ten languages, with particular focus on basic operations relevant to conversational agents handling time-sensitive tasks. {TRD} utilizes human-curated carrier phrases to generate a resilient-to-overfitting dataset with diverse samples and controlled difficulty levels across five core task categories, each at five difficulty levels. Extensive experimentation shows consistent patterns in model performance across languages, with a strong linear decline in accuracy as task difficulty rises in reasoning-based tasks, while memorization-based tasks remain stable. Furthermore, reasoning tasks remain robust across temporal shifts, whereas memorization tasks show performance degradation. Additionally, contextual modifications to prompts influence model performance differently than human cognitive patterns."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mazzia-etal-2026-benchmarking">
<titleInfo>
<title>Benchmarking Multilingual Temporal Reasoning in LLMs: The Temporal Reasoning Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vittorio</namePart>
<namePart type="family">Mazzia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sandro</namePart>
<namePart type="family">Pollastrini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Davide</namePart>
<namePart type="family">Bernardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chiara</namePart>
<namePart type="family">Rubagotti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniele</namePart>
<namePart type="family">Amberti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-02</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th International Workshop on Spoken Dialogue System Technology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Riccardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seyed</namePart>
<namePart type="given">Mahed</namePart>
<namePart type="family">Mousavi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Ines</namePart>
<namePart type="family">Torres</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koichiro</namePart>
<namePart type="family">Yoshino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zoraida</namePart>
<namePart type="family">Callejas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shammur</namePart>
<namePart type="given">Absar</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frederic</namePart>
<namePart type="family">Bechet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joakim</namePart>
<namePart type="family">Gustafson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Géraldine</namePart>
<namePart type="family">Damnati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Papangelis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="given">Fernando</namePart>
<namePart type="family">D’Haro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Mendonça</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raffaella</namePart>
<namePart type="family">Bernardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilek</namePart>
<namePart type="family">Hakkani-Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="given">”Pino”</namePart>
<namePart type="family">Di Fabbrizio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatsuya</namePart>
<namePart type="family">Kawahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Firoj</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gokhan</namePart>
<namePart type="family">Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Johnston</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Trento, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Time reasoning is a make-or-break capability for Large Language Models (LLMs) aspiring to act as reliable personal and enterprise assistants. This work introduces the Temporal Reasoning Dataset (TRD), a programmatically generated multilingual benchmark designed to evaluate temporal reasoning operational capabilities in LLMs across ten languages, with particular focus on basic operations relevant to conversational agents handling time-sensitive tasks. TRD utilizes human-curated carrier phrases to generate a resilient-to-overfitting dataset with diverse samples and controlled difficulty levels across five core task categories, each at five difficulty levels. Extensive experimentation shows consistent patterns in model performance across languages, with a strong linear decline in accuracy as task difficulty rises in reasoning-based tasks, while memorization-based tasks remain stable. Furthermore, reasoning tasks remain robust across temporal shifts, whereas memorization tasks show performance degradation. Additionally, contextual modifications to prompts influence model performance differently than human cognitive patterns.</abstract>
<identifier type="citekey">mazzia-etal-2026-benchmarking</identifier>
<location>
<url>https://aclanthology.org/2026.iwsds-1.19/</url>
</location>
<part>
<date>2026-02</date>
<extent unit="page">
<start>168</start>
<end>181</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Benchmarking Multilingual Temporal Reasoning in LLMs: The Temporal Reasoning Dataset
%A Mazzia, Vittorio
%A Pollastrini, Sandro
%A Bernardi, Davide
%A Rubagotti, Chiara
%A Amberti, Daniele
%Y Riccardi, Giuseppe
%Y Mousavi, Seyed Mahed
%Y Torres, Maria Ines
%Y Yoshino, Koichiro
%Y Callejas, Zoraida
%Y Chowdhury, Shammur Absar
%Y Chen, Yun-Nung
%Y Bechet, Frederic
%Y Gustafson, Joakim
%Y Damnati, Géraldine
%Y Papangelis, Alex
%Y D’Haro, Luis Fernando
%Y Mendonça, John
%Y Bernardi, Raffaella
%Y Hakkani-Tur, Dilek
%Y Di Fabbrizio, Giuseppe ”Pino”
%Y Kawahara, Tatsuya
%Y Alam, Firoj
%Y Tur, Gokhan
%Y Johnston, Michael
%S Proceedings of the 16th International Workshop on Spoken Dialogue System Technology
%D 2026
%8 February
%I Association for Computational Linguistics
%C Trento, Italy
%F mazzia-etal-2026-benchmarking
%X Time reasoning is a make-or-break capability for Large Language Models (LLMs) aspiring to act as reliable personal and enterprise assistants. This work introduces the Temporal Reasoning Dataset (TRD), a programmatically generated multilingual benchmark designed to evaluate temporal reasoning operational capabilities in LLMs across ten languages, with particular focus on basic operations relevant to conversational agents handling time-sensitive tasks. TRD utilizes human-curated carrier phrases to generate a resilient-to-overfitting dataset with diverse samples and controlled difficulty levels across five core task categories, each at five difficulty levels. Extensive experimentation shows consistent patterns in model performance across languages, with a strong linear decline in accuracy as task difficulty rises in reasoning-based tasks, while memorization-based tasks remain stable. Furthermore, reasoning tasks remain robust across temporal shifts, whereas memorization tasks show performance degradation. Additionally, contextual modifications to prompts influence model performance differently than human cognitive patterns.
%U https://aclanthology.org/2026.iwsds-1.19/
%P 168-181
Markdown (Informal)
[Benchmarking Multilingual Temporal Reasoning in LLMs: The Temporal Reasoning Dataset](https://aclanthology.org/2026.iwsds-1.19/) (Mazzia et al., IWSDS 2026)
ACL