@inproceedings{tang-etal-2025-l,
title = "$\textit{L-CiteEval}$: A Suite for Evaluating Fidelity of Long-context Models",
author = "Tang, Zecheng and
Zhou, Keyan and
Li, Juntao and
Ji, Baibei and
Hou, Jianye and
Zhang, Min",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.263/",
doi = "10.18653/v1/2025.acl-long.263",
pages = "5254--5277",
ISBN = "979-8-89176-251-0",
abstract = "Long-context models(LCMs) have witnessed remarkable advancements in recent years, facilitating real-world tasks like long-document QA. The success of LCMs is founded on the hypothesis that the model demonstrates strong $\textbf{fidelity}$, enabling it to respond based on the provided long context rather than relying solely on the intrinsic knowledge acquired during pre-training. Yet, in this paper, we find that open-sourced LCMs are not as faithful as expected. We introduce $\textit{L-CiteEval}$, an out-of-the-box suite that can assess both generation quality and fidelity in long-context understanding tasks. It covers 11 tasks with context lengths ranging from 8K to 48K and a corresponding automatic evaluation pipeline. Evaluation of 11 cutting-edge closed-source and open-source LCMs indicates that, while there are minor differences in their generation, open-source models significantly lag behind closed-source counterparts in terms of fidelity. Furthermore, we analyze the benefits of citation generation for LCMs from both the perspective of explicit model output and the internal attention mechanism."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tang-etal-2025-l">
<titleInfo>
<title>L-CiteEval: A Suite for Evaluating Fidelity of Long-context Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zecheng</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keyan</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juntao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Baibei</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianye</namePart>
<namePart type="family">Hou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Long-context models(LCMs) have witnessed remarkable advancements in recent years, facilitating real-world tasks like long-document QA. The success of LCMs is founded on the hypothesis that the model demonstrates strong fidelity, enabling it to respond based on the provided long context rather than relying solely on the intrinsic knowledge acquired during pre-training. Yet, in this paper, we find that open-sourced LCMs are not as faithful as expected. We introduce L-CiteEval, an out-of-the-box suite that can assess both generation quality and fidelity in long-context understanding tasks. It covers 11 tasks with context lengths ranging from 8K to 48K and a corresponding automatic evaluation pipeline. Evaluation of 11 cutting-edge closed-source and open-source LCMs indicates that, while there are minor differences in their generation, open-source models significantly lag behind closed-source counterparts in terms of fidelity. Furthermore, we analyze the benefits of citation generation for LCMs from both the perspective of explicit model output and the internal attention mechanism.</abstract>
<identifier type="citekey">tang-etal-2025-l</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.263</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.263/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>5254</start>
<end>5277</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T L-CiteEval: A Suite for Evaluating Fidelity of Long-context Models
%A Tang, Zecheng
%A Zhou, Keyan
%A Li, Juntao
%A Ji, Baibei
%A Hou, Jianye
%A Zhang, Min
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F tang-etal-2025-l
%X Long-context models(LCMs) have witnessed remarkable advancements in recent years, facilitating real-world tasks like long-document QA. The success of LCMs is founded on the hypothesis that the model demonstrates strong fidelity, enabling it to respond based on the provided long context rather than relying solely on the intrinsic knowledge acquired during pre-training. Yet, in this paper, we find that open-sourced LCMs are not as faithful as expected. We introduce L-CiteEval, an out-of-the-box suite that can assess both generation quality and fidelity in long-context understanding tasks. It covers 11 tasks with context lengths ranging from 8K to 48K and a corresponding automatic evaluation pipeline. Evaluation of 11 cutting-edge closed-source and open-source LCMs indicates that, while there are minor differences in their generation, open-source models significantly lag behind closed-source counterparts in terms of fidelity. Furthermore, we analyze the benefits of citation generation for LCMs from both the perspective of explicit model output and the internal attention mechanism.
%R 10.18653/v1/2025.acl-long.263
%U https://aclanthology.org/2025.acl-long.263/
%U https://doi.org/10.18653/v1/2025.acl-long.263
%P 5254-5277
Markdown (Informal)
[L-CiteEval: A Suite for Evaluating Fidelity of Long-context Models](https://aclanthology.org/2025.acl-long.263/) (Tang et al., ACL 2025)
ACL
- Zecheng Tang, Keyan Zhou, Juntao Li, Baibei Ji, Jianye Hou, and Min Zhang. 2025. L-CiteEval: A Suite for Evaluating Fidelity of Long-context Models. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 5254–5277, Vienna, Austria. Association for Computational Linguistics.