@inproceedings{lu-etal-2025-piper,
title = "{PIPER}: Benchmarking and Prompting Event Reasoning Boundary of {LLM}s via Debiasing-Distillation Enhanced Tuning",
author = "Lu, Zhicong and
Tian, Changyuan and
Li, Peiguang and
Jin, Li and
Wang, Sirui and
Jia, Wei and
Shen, Ying and
Xu, Guangluan",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1389/",
doi = "10.18653/v1/2025.acl-long.1389",
pages = "28591--28613",
ISBN = "979-8-89176-251-0",
abstract = "While Large Language Models (LLMs) excel in diverse domains, their validity in event reasoning remains underexplored. Most existing works merely stagnate at assessing LLMs' event reasoning with a single event relational type or reasoning format, failing to conduct a complete evaluation and provide a practical solution for capability enhancement. In this paper, we propose $\textbf{PIPER}$, the first comprehensive benchmark for $\textbf{P}$robing $\textbf{I}$nto the $\textbf{P}$erformance boundary of LLMs in $\textbf{E}$vent $\textbf{R}$easoning. Motivated by our evaluation observations and error patterns analysis, we meticulously craft 10K diverse instruction-tuning demonstrations to alleviate event reasoning-oriented data scarcity. Additionally, a novel $\textbf{D}$ebiasing and $\textbf{D}$istillation-$\textbf{E}$nhanced $\textbf{S}$upervised $\textbf{F}$ine-$\textbf{T}$uning ($\mathbf{D^2}$$\textbf{E-SFT}$) strategy is presented, which facilitates adhering to context and fixating significant contextual event information to elevate the event reasoning capability. Specifically, $\mathrm{D^2}$E-SFT removes the given sample{'}s context to construct an imagined sample, subtracting its logits to mitigate the bias of neglecting context and improve contextual faithfulness. To guide the model in emphasizing significant contextual event information, $\mathrm{D^2}$E-SFT employs a context-refined sample to achieve self-distillation with the alignment of logits. Extensive experimental results demonstrate the effectiveness of our data and strategy in expanding the performance boundary of event reasoning."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lu-etal-2025-piper">
<titleInfo>
<title>PIPER: Benchmarking and Prompting Event Reasoning Boundary of LLMs via Debiasing-Distillation Enhanced Tuning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhicong</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Changyuan</namePart>
<namePart type="family">Tian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peiguang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Li</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sirui</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Jia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ying</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guangluan</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>While Large Language Models (LLMs) excel in diverse domains, their validity in event reasoning remains underexplored. Most existing works merely stagnate at assessing LLMs’ event reasoning with a single event relational type or reasoning format, failing to conduct a complete evaluation and provide a practical solution for capability enhancement. In this paper, we propose PIPER, the first comprehensive benchmark for Probing Into the Performance boundary of LLMs in Event Reasoning. Motivated by our evaluation observations and error patterns analysis, we meticulously craft 10K diverse instruction-tuning demonstrations to alleviate event reasoning-oriented data scarcity. Additionally, a novel Debiasing and Distillation-Enhanced Supervised Fine-Tuning (D²E-SFT) strategy is presented, which facilitates adhering to context and fixating significant contextual event information to elevate the event reasoning capability. Specifically, D²E-SFT removes the given sample’s context to construct an imagined sample, subtracting its logits to mitigate the bias of neglecting context and improve contextual faithfulness. To guide the model in emphasizing significant contextual event information, D²E-SFT employs a context-refined sample to achieve self-distillation with the alignment of logits. Extensive experimental results demonstrate the effectiveness of our data and strategy in expanding the performance boundary of event reasoning.</abstract>
<identifier type="citekey">lu-etal-2025-piper</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1389</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1389/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>28591</start>
<end>28613</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PIPER: Benchmarking and Prompting Event Reasoning Boundary of LLMs via Debiasing-Distillation Enhanced Tuning
%A Lu, Zhicong
%A Tian, Changyuan
%A Li, Peiguang
%A Jin, Li
%A Wang, Sirui
%A Jia, Wei
%A Shen, Ying
%A Xu, Guangluan
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F lu-etal-2025-piper
%X While Large Language Models (LLMs) excel in diverse domains, their validity in event reasoning remains underexplored. Most existing works merely stagnate at assessing LLMs’ event reasoning with a single event relational type or reasoning format, failing to conduct a complete evaluation and provide a practical solution for capability enhancement. In this paper, we propose PIPER, the first comprehensive benchmark for Probing Into the Performance boundary of LLMs in Event Reasoning. Motivated by our evaluation observations and error patterns analysis, we meticulously craft 10K diverse instruction-tuning demonstrations to alleviate event reasoning-oriented data scarcity. Additionally, a novel Debiasing and Distillation-Enhanced Supervised Fine-Tuning (D²E-SFT) strategy is presented, which facilitates adhering to context and fixating significant contextual event information to elevate the event reasoning capability. Specifically, D²E-SFT removes the given sample’s context to construct an imagined sample, subtracting its logits to mitigate the bias of neglecting context and improve contextual faithfulness. To guide the model in emphasizing significant contextual event information, D²E-SFT employs a context-refined sample to achieve self-distillation with the alignment of logits. Extensive experimental results demonstrate the effectiveness of our data and strategy in expanding the performance boundary of event reasoning.
%R 10.18653/v1/2025.acl-long.1389
%U https://aclanthology.org/2025.acl-long.1389/
%U https://doi.org/10.18653/v1/2025.acl-long.1389
%P 28591-28613
Markdown (Informal)
[PIPER: Benchmarking and Prompting Event Reasoning Boundary of LLMs via Debiasing-Distillation Enhanced Tuning](https://aclanthology.org/2025.acl-long.1389/) (Lu et al., ACL 2025)
ACL
- Zhicong Lu, Changyuan Tian, Peiguang Li, Li Jin, Sirui Wang, Wei Jia, Ying Shen, and Guangluan Xu. 2025. PIPER: Benchmarking and Prompting Event Reasoning Boundary of LLMs via Debiasing-Distillation Enhanced Tuning. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 28591–28613, Vienna, Austria. Association for Computational Linguistics.
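
For readers who only have this record, the following is a minimal, hypothetical sketch of the two ideas the abstract describes: subtracting the logits of a context-free "imagined" sample to counter the bias of ignoring context, and aligning logits with those of a context-refined sample as a self-distillation signal. All function names, the loss weighting, and the KL formulation are illustrative assumptions drawn only from the abstract, not the authors' implementation.

```python
import torch
import torch.nn.functional as F

# Illustrative sketch (not the paper's code): debiasing by logit subtraction
# plus self-distillation by logit alignment, as sketched in the abstract.

def debiased_logits(ctx_logits: torch.Tensor,
                    imagined_logits: torch.Tensor,
                    alpha: float = 1.0) -> torch.Tensor:
    """Subtract the no-context ("imagined") logits from the full-context logits."""
    return ctx_logits - alpha * imagined_logits

def self_distill_loss(ctx_logits: torch.Tensor,
                      refined_logits: torch.Tensor,
                      temperature: float = 2.0) -> torch.Tensor:
    """KL term pulling full-context logits toward context-refined logits."""
    student = F.log_softmax(ctx_logits / temperature, dim=-1)
    teacher = F.softmax(refined_logits.detach() / temperature, dim=-1)
    return F.kl_div(student, teacher, reduction="batchmean") * temperature ** 2

if __name__ == "__main__":
    vocab = 8
    ctx = torch.randn(2, vocab)       # logits given the full context
    imagined = torch.randn(2, vocab)  # logits with the context removed
    refined = torch.randn(2, vocab)   # logits given a context-refined sample

    target = torch.tensor([3, 5])
    ce = F.cross_entropy(debiased_logits(ctx, imagined), target)
    kd = self_distill_loss(ctx, refined)
    total = ce + 0.5 * kd             # hypothetical loss weighting
    print(float(ce), float(kd), float(total))
```

For the exact formulation, training data, and hyperparameters, see the paper at the URL above.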