@inproceedings{xu-etal-2025-diagnosing,
title = "Diagnosing Failures in Large Language Models' Answers: Integrating Error Attribution into Evaluation Framework",
author = "Xu, Zishan and
Xie, Shuyi and
Lv, Qingsong and
Xiao, Shupei and
Song, Linlin and
Wenjuan, Sui and
Lin, Fan",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1089/",
doi = "10.18653/v1/2025.findings-acl.1089",
pages = "21148--21165",
ISBN = "979-8-89176-256-5",
abstract = "With the widespread application of Large Language Models (LLMs) in various tasks, the mainstream LLM platforms generate massive user-model interactions daily. In order to efficiently analyze the performance of models and diagnose failures in their answers, it is essential to develop an automated framework to systematically categorize and attribute errors. However, existing evaluation models lack error attribution capability. In this work, we establish a comprehensive Misattribution Framework with 6 primary and 15 secondary categories to facilitate in-depth analysis. Based on this framework, we present AttriData, a dataset specifically designed for error attribution, encompassing misattribution, along with the corresponding scores and feedback. We also propose MisAttributionLLM, a fine-tuned model on AttriData, which is the first general-purpose judge model capable of simultaneously generating score, misattribution, and feedback. Extensive experiments and analyses are conducted to confirm the effectiveness and robustness of our proposed method."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xu-etal-2025-diagnosing">
<titleInfo>
<title>Diagnosing Failures in Large Language Models’ Answers: Integrating Error Attribution into Evaluation Framework</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zishan</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuyi</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qingsong</namePart>
<namePart type="family">Lv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shupei</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Linlin</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sui</namePart>
<namePart type="family">Wenjuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fan</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>With the widespread application of Large Language Models (LLMs) in various tasks, the mainstream LLM platforms generate massive user-model interactions daily. In order to efficiently analyze the performance of models and diagnose failures in their answers, it is essential to develop an automated framework to systematically categorize and attribute errors. However, existing evaluation models lack error attribution capability. In this work, we establish a comprehensive Misattribution Framework with 6 primary and 15 secondary categories to facilitate in-depth analysis. Based on this framework, we present AttriData, a dataset specifically designed for error attribution, encompassing misattribution, along with the corresponding scores and feedback. We also propose MisAttributionLLM, a fine-tuned model on AttriData, which is the first general-purpose judge model capable of simultaneously generating score, misattribution, and feedback. Extensive experiments and analyses are conducted to confirm the effectiveness and robustness of our proposed method.</abstract>
<identifier type="citekey">xu-etal-2025-diagnosing</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1089</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1089/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>21148</start>
<end>21165</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Diagnosing Failures in Large Language Models’ Answers: Integrating Error Attribution into Evaluation Framework
%A Xu, Zishan
%A Xie, Shuyi
%A Lv, Qingsong
%A Xiao, Shupei
%A Song, Linlin
%A Wenjuan, Sui
%A Lin, Fan
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F xu-etal-2025-diagnosing
%X With the widespread application of Large Language Models (LLMs) in various tasks, the mainstream LLM platforms generate massive user-model interactions daily. In order to efficiently analyze the performance of models and diagnose failures in their answers, it is essential to develop an automated framework to systematically categorize and attribute errors. However, existing evaluation models lack error attribution capability. In this work, we establish a comprehensive Misattribution Framework with 6 primary and 15 secondary categories to facilitate in-depth analysis. Based on this framework, we present AttriData, a dataset specifically designed for error attribution, encompassing misattribution, along with the corresponding scores and feedback. We also propose MisAttributionLLM, a fine-tuned model on AttriData, which is the first general-purpose judge model capable of simultaneously generating score, misattribution, and feedback. Extensive experiments and analyses are conducted to confirm the effectiveness and robustness of our proposed method.
%R 10.18653/v1/2025.findings-acl.1089
%U https://aclanthology.org/2025.findings-acl.1089/
%U https://doi.org/10.18653/v1/2025.findings-acl.1089
%P 21148-21165
Markdown (Informal)
[Diagnosing Failures in Large Language Models’ Answers: Integrating Error Attribution into Evaluation Framework](https://aclanthology.org/2025.findings-acl.1089/) (Xu et al., Findings 2025)
ACL