% Conference paper from Findings of ACL 2026 (see `url` for the ACL Anthology page).
% Acronyms and proper nouns in title/booktitle are brace-protected as whole words
% so that sentence-casing styles keep their capitalisation.
@inproceedings{xu-etal-2026-travel,
  title     = {Travel on the {ICD} Tree: Benchmarking Agentic Reasoning for {ICD} Coding from {Chinese} Electronic Medical Records},
  author    = {Xu, Xinjie and
               Fan, Yongqi and
               Chen, Shuang-shuang and
               Ye, Qi and
               Guo, Weibin and
               Hu, Xinxuan},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.191/},
  pages     = {3927--3943},
  isbn      = {979-8-89176-395-1},
  abstract  = {Accurate International Classification of Diseases (ICD) coding is crucial for hospital management and healthcare data governance. In clinical practice, straightforward cases can often be matched directly to ICD codes via diagnostic text, establishing retrieval-based methods as the baseline. More advanced approaches leverage large language models to rerank these results. However, real-world coding scenarios are typically more complex, demanding reasoning that goes beyond superficial descriptions. For instance, it involves synthesizing key information such as disease subtype, anatomical location, and complications from complex progress notes to accurately identify the primary diagnosis. However, a comprehensive evaluation framework for ICD coding based on complete EMRs is still lacking. To address these challenges, we constructed the Code4Detail dataset, which comprises 560 real clinical records covering 434 common diseases across 19 core chapters of ICD-10. To systematically explore the capability boundaries of large language models under different paradigms, we further propose the Travel on the ICD Tree (ToT-ICD) evaluation framework. Unlike the conventional retrieval-recall approach, ToT-ICD treats ICD coding as a structured exploration process across a hierarchical taxonomy. We design an agentic workflow that integrates similarity retrieval, path-guided navigation, and dynamic backtracking, enabling logical reasoning and decision-making under coding rules.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="xu-etal-2026-travel">
    <!-- Title of the paper -->
    <titleInfo>
      <title>Travel on the ICD Tree: Benchmarking Agentic Reasoning for ICD Coding from Chinese Electronic Medical Records</title>
    </titleInfo>
    <!-- Authors, in listed order -->
    <name type="personal">
      <namePart type="given">Xinjie</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yongqi</namePart>
      <namePart type="family">Fan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shuang-shuang</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Qi</namePart>
      <namePart type="family">Ye</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Weibin</namePart>
      <namePart type="family">Guo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xinxuan</namePart>
      <namePart type="family">Hu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <!-- Issue date of the paper (year-month) -->
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <!-- Abstract text, kept on a single line -->
    <abstract>Accurate International Classification of Diseases (ICD) coding is crucial for hospital management and healthcare data governance. In clinical practice, straightforward cases can often be matched directly to ICD codes via diagnostic text, establishing retrieval-based methods as the baseline. More advanced approaches leverage large language models to rerank these results. However, real-world coding scenarios are typically more complex, demanding reasoning that goes beyond superficial descriptions. For instance, it involves synthesizing key information such as disease subtype, anatomical location, and complications from complex progress notes to accurately identify the primary diagnosis. However, a comprehensive evaluation framework for ICD coding based on complete EMRs is still lacking. To address these challenges, we constructed the Code4Detail dataset, which comprises 560 real clinical records covering 434 common diseases across 19 core chapters of ICD-10. To systematically explore the capability boundaries of large language models under different paradigms, we further propose the Travel on the ICD Tree (ToT-ICD) evaluation framework. Unlike the conventional retrieval-recall approach, ToT-ICD treats ICD coding as a structured exploration process across a hierarchical taxonomy. We design an agentic workflow that integrates similarity retrieval, path-guided navigation, and dynamic backtracking, enabling logical reasoning and decision-making under coding rules.</abstract>
    <!-- Citation key and landing-page URL -->
    <identifier type="citekey">xu-etal-2026-travel</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.191/</url>
    </location>
    <!-- Position within the host volume: date and page range -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>3927</start>
        <end>3943</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Travel on the ICD Tree: Benchmarking Agentic Reasoning for ICD Coding from Chinese Electronic Medical Records
%A Xu, Xinjie
%A Fan, Yongqi
%A Chen, Shuang-shuang
%A Ye, Qi
%A Guo, Weibin
%A Hu, Xinxuan
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F xu-etal-2026-travel
%X Accurate International Classification of Diseases (ICD) coding is crucial for hospital management and healthcare data governance. In clinical practice, straightforward cases can often be matched directly to ICD codes via diagnostic text, establishing retrieval-based methods as the baseline. More advanced approaches leverage large language models to rerank these results. However, real-world coding scenarios are typically more complex, demanding reasoning that goes beyond superficial descriptions. For instance, it involves synthesizing key information such as disease subtype, anatomical location, and complications from complex progress notes to accurately identify the primary diagnosis. However, a comprehensive evaluation framework for ICD coding based on complete EMRs is still lacking. To address these challenges, we constructed the Code4Detail dataset, which comprises 560 real clinical records covering 434 common diseases across 19 core chapters of ICD-10. To systematically explore the capability boundaries of large language models under different paradigms, we further propose the Travel on the ICD Tree (ToT-ICD) evaluation framework. Unlike the conventional retrieval-recall approach, ToT-ICD treats ICD coding as a structured exploration process across a hierarchical taxonomy. We design an agentic workflow that integrates similarity retrieval, path-guided navigation, and dynamic backtracking, enabling logical reasoning and decision-making under coding rules.
%U https://aclanthology.org/2026.findings-acl.191/
%P 3927-3943
Markdown (Informal)
[Travel on the ICD Tree: Benchmarking Agentic Reasoning for ICD Coding from Chinese Electronic Medical Records](https://aclanthology.org/2026.findings-acl.191/) (Xu et al., Findings 2026)
ACL