@inproceedings{mathai-pierrehumbert-2025-eventhopnli,
title = "{E}vent{H}op{NLI}: A Functional Dataset for Systematically Diagnosing Logical Failures in {LLM} Temporal Reasoning",
author = "Mathai, Ved and
Pierrehumbert, Janet B.",
editor = "Ilinykh, Nikolai and
Appelgren, Mattias and
Lagerstedt, Erik",
booktitle = "Proceedings of the 2025 CLASP Conference on Language models And RePresentations (LARP)",
month = sep,
year = "2025",
address = "Gothenburg, Sweden",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.clasp-main.2/",
pages = "11--27",
ISBN = "979-8-89176-249-7",
abstract = "This paper presents EventHopNLI, a simplified functional diagnostic dataset for the task of event temporal ordering. This paper uses this diagnostic dataset to improve the interpretability of the performance of attention-based language models on this task. Existing datasets based on natural data have multiple overlapping linguistic features. Simplifying and isolating these features improves interpretability. EventHopNLI is a programmatically-created NLI dataset that systematically varies over various complexity factors such as number of events, number of logical hops etc. Even though EventHopNLI is highly simplified, it still proves challenging to language models. Being functional, the dataset is dynamic. This reduces the risk that the data is available to language models during training. We ablate over the different complexity parameters and illustrate different shortcomings of attention-based models at this task. We discuss the performance of RoBERTa-large, Llama-405B and GPT-4o."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mathai-pierrehumbert-2025-eventhopnli">
<titleInfo>
<title>EventHopNLI: A Functional Dataset for Systematically Diagnosing Logical Failures in LLM Temporal Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ved</namePart>
<namePart type="family">Mathai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Janet</namePart>
<namePart type="given">B</namePart>
<namePart type="family">Pierrehumbert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 CLASP Conference on Language models And RePresentations (LARP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikolai</namePart>
<namePart type="family">Ilinykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mattias</namePart>
<namePart type="family">Appelgren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erik</namePart>
<namePart type="family">Lagerstedt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gothenburg, Sweden</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-249-7</identifier>
</relatedItem>
<abstract>This paper presents EventHopNLI, a simplified functional diagnostic dataset for the task of event temporal ordering. This paper uses this diagnostic dataset to improve the interpretability of the performance of attention-based language models on this task. Existing datasets based on natural data have multiple overlapping linguistic features. Simplifying and isolating these features improves interpretability. EventHopNLI is a programmatically-created NLI dataset that systematically varies over various complexity factors such as number of events, number of logical hops etc. Even though EventHopNLI is highly simplified, it still proves challenging to language models. Being functional, the dataset is dynamic. This reduces the risk that the data is available to language models during training. We ablate over the different complexity parameters and illustrate different shortcomings of attention-based models at this task. We discuss the performance of RoBERTa-large, Llama-405B and GPT-4o.</abstract>
<identifier type="citekey">mathai-pierrehumbert-2025-eventhopnli</identifier>
<location>
<url>https://aclanthology.org/2025.clasp-main.2/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>11</start>
<end>27</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T EventHopNLI: A Functional Dataset for Systematically Diagnosing Logical Failures in LLM Temporal Reasoning
%A Mathai, Ved
%A Pierrehumbert, Janet B.
%Y Ilinykh, Nikolai
%Y Appelgren, Mattias
%Y Lagerstedt, Erik
%S Proceedings of the 2025 CLASP Conference on Language models And RePresentations (LARP)
%D 2025
%8 September
%I Association for Computational Linguistics
%C Gothenburg, Sweden
%@ 979-8-89176-249-7
%F mathai-pierrehumbert-2025-eventhopnli
%X This paper presents EventHopNLI, a simplified functional diagnostic dataset for the task of event temporal ordering. This paper uses this diagnostic dataset to improve the interpretability of the performance of attention-based language models on this task. Existing datasets based on natural data have multiple overlapping linguistic features. Simplifying and isolating these features improves interpretability. EventHopNLI is a programmatically-created NLI dataset that systematically varies over various complexity factors such as number of events, number of logical hops etc. Even though EventHopNLI is highly simplified, it still proves challenging to language models. Being functional, the dataset is dynamic. This reduces the risk that the data is available to language models during training. We ablate over the different complexity parameters and illustrate different shortcomings of attention-based models at this task. We discuss the performance of RoBERTa-large, Llama-405B and GPT-4o.
%U https://aclanthology.org/2025.clasp-main.2/
%P 11-27
Markdown (Informal)
[EventHopNLI: A Functional Dataset for Systematically Diagnosing Logical Failures in LLM Temporal Reasoning](https://aclanthology.org/2025.clasp-main.2/) (Mathai & Pierrehumbert, CLASP 2025)
ACL