@inproceedings{ou-etal-2025-agentdiagnose,
title = "{A}gent{D}iagnose: An Open Toolkit for Diagnosing {LLM} Agent Trajectories",
author = "Ou, Tianyue and
Guo, Wanyao and
Gandhi, Apurva and
Neubig, Graham and
Yue, Xiang",
editor = {Habernal, Ivan and
Schulam, Peter and
Tiedemann, J{\"o}rg},
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-demos.15/",
pages = "207--215",
ISBN = "979-8-89176-334-0",
abstract = "Large Language Model (LLM) agents produce rich, multi-step trajectories that interleave observations, internal reasoning, and tool actions. However, most evaluation pipelines focus solely on end-task success, leaving the agent{'}s decision-making process opaque and poorly understood. We introduce AgentDiagnose, an open-source, modular framework for diagnosing agent trajectories. The present release fully supports the web domain, and AgentDiagnose is architect as an extensible, open platform with compatibility for most agent trajectories. AgentDiagnose consists of (i) an evaluation module that quantifies five core agentic competencies{---}backtracking {\&} exploration, task decomposition, observation reading, self-verification, and objective quality{---}and (ii) a visualization module that highlights trajectory semantics through t-SNE action embeddings, interactive word clouds, and state-transition timelines. On a set of 30 manually annotated trajectories, our automatic metrics achieve a mean Pearson correlation of 0.57 with human judgments, rising to 0.78 for task decomposition. Furthermore, filtering the 46k-example NNetNav-Live dataset with AgentDiagnose and fine-tuning a Llama-3.1-8B model on the top 6k trajectories improves WebArena success rates by 0.98, despite using only 13{\%} of the original data. AgentDiagnose thus serves as both a diagnostic lens for agent analysis and a practical tool for curating higher-quality training data. The toolkit and demo are publicly available."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ou-etal-2025-agentdiagnose">
<titleInfo>
<title>AgentDiagnose: An Open Toolkit for Diagnosing LLM Agent Trajectories</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tianyue</namePart>
<namePart type="family">Ou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wanyao</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Apurva</namePart>
<namePart type="family">Gandhi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Graham</namePart>
<namePart type="family">Neubig</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiang</namePart>
<namePart type="family">Yue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Habernal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Schulam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-334-0</identifier>
</relatedItem>
<abstract>Large Language Model (LLM) agents produce rich, multi-step trajectories that interleave observations, internal reasoning, and tool actions. However, most evaluation pipelines focus solely on end-task success, leaving the agent’s decision-making process opaque and poorly understood. We introduce AgentDiagnose, an open-source, modular framework for diagnosing agent trajectories. The present release fully supports the web domain, and AgentDiagnose is architect as an extensible, open platform with compatibility for most agent trajectories. AgentDiagnose consists of (i) an evaluation module that quantifies five core agentic competencies—backtracking & exploration, task decomposition, observation reading, self-verification, and objective quality—and (ii) a visualization module that highlights trajectory semantics through t-SNE action embeddings, interactive word clouds, and state-transition timelines. On a set of 30 manually annotated trajectories, our automatic metrics achieve a mean Pearson correlation of 0.57 with human judgments, rising to 0.78 for task decomposition. Furthermore, filtering the 46k-example NNetNav-Live dataset with AgentDiagnose and fine-tuning a Llama-3.1-8B model on the top 6k trajectories improves WebArena success rates by 0.98, despite using only 13% of the original data. AgentDiagnose thus serves as both a diagnostic lens for agent analysis and a practical tool for curating higher-quality training data. The toolkit and demo are publicly available.</abstract>
<identifier type="citekey">ou-etal-2025-agentdiagnose</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-demos.15/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>207</start>
<end>215</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AgentDiagnose: An Open Toolkit for Diagnosing LLM Agent Trajectories
%A Ou, Tianyue
%A Guo, Wanyao
%A Gandhi, Apurva
%A Neubig, Graham
%A Yue, Xiang
%Y Habernal, Ivan
%Y Schulam, Peter
%Y Tiedemann, Jörg
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-334-0
%F ou-etal-2025-agentdiagnose
%X Large Language Model (LLM) agents produce rich, multi-step trajectories that interleave observations, internal reasoning, and tool actions. However, most evaluation pipelines focus solely on end-task success, leaving the agent’s decision-making process opaque and poorly understood. We introduce AgentDiagnose, an open-source, modular framework for diagnosing agent trajectories. The present release fully supports the web domain, and AgentDiagnose is architect as an extensible, open platform with compatibility for most agent trajectories. AgentDiagnose consists of (i) an evaluation module that quantifies five core agentic competencies—backtracking & exploration, task decomposition, observation reading, self-verification, and objective quality—and (ii) a visualization module that highlights trajectory semantics through t-SNE action embeddings, interactive word clouds, and state-transition timelines. On a set of 30 manually annotated trajectories, our automatic metrics achieve a mean Pearson correlation of 0.57 with human judgments, rising to 0.78 for task decomposition. Furthermore, filtering the 46k-example NNetNav-Live dataset with AgentDiagnose and fine-tuning a Llama-3.1-8B model on the top 6k trajectories improves WebArena success rates by 0.98, despite using only 13% of the original data. AgentDiagnose thus serves as both a diagnostic lens for agent analysis and a practical tool for curating higher-quality training data. The toolkit and demo are publicly available.
%U https://aclanthology.org/2025.emnlp-demos.15/
%P 207-215
Markdown (Informal)
[AgentDiagnose: An Open Toolkit for Diagnosing LLM Agent Trajectories](https://aclanthology.org/2025.emnlp-demos.15/) (Ou et al., EMNLP 2025)
ACL