@inproceedings{xu-etal-2025-tree,
title = "Tree-of-Quote Prompting Improves Factuality and Attribution in Multi-Hop and Medical Reasoning",
author = "Xu, Justin and
Li, Yiming and
Zhang, Zizheng and
Luk, Augustine Yui Hei and
Jobanputra, Mayank and
Oza, Samarth and
Murray, Ashley and
Kasula, Meghana Reddy and
Parker, Andrew and
Eyre, David W",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.285/",
doi = "10.18653/v1/2025.emnlp-main.285",
pages = "5605--5622",
ISBN = "979-8-89176-332-6",
abstract = "Large language models (LLMs) can produce fluent but factually incorrect outputs and often have limited ability to attribute their claims to source material. This undermines their reliability, particularly in multi-hop and high-stakes domains such as medicine. We propose Tree-of-Quote (ToQ) prompting, a structured framework that decomposes complex questions into subquestions, generates quotes to support each step without retrieval, and selectively advances reasoning based on quote quality. We also introduce FQ-Score, a unified metric that captures answer correctness, attribution fidelity, and reasoning quality. Experiments on StrategyQA, 2WikiMultiHopQA, MuSiQue, MoreHopQA, and MedQA demonstrate that ToQ improves factuality and attribution over standard prompting baselines. To validate FQ-Score as a proxy for human judgment, we conduct two reader studies with clinicians on medical questions, and observe strong correlations. Both clinician scores and FQ-Scores also indicate a preference for ToQ over baselines due to a combination of greater correctness, completeness, and logical flow. Our results suggest ToQ is a promising approach for building more trustworthy and auditable LLM systems."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xu-etal-2025-tree">
<titleInfo>
<title>Tree-of-Quote Prompting Improves Factuality and Attribution in Multi-Hop and Medical Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Justin</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yiming</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zizheng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Augustine</namePart>
<namePart type="given">Yui</namePart>
<namePart type="given">Hei</namePart>
<namePart type="family">Luk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mayank</namePart>
<namePart type="family">Jobanputra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samarth</namePart>
<namePart type="family">Oza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashley</namePart>
<namePart type="family">Murray</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meghana</namePart>
<namePart type="given">Reddy</namePart>
<namePart type="family">Kasula</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="family">Parker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="given">W</namePart>
<namePart type="family">Eyre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Large language models (LLMs) can produce fluent but factually incorrect outputs and often have limited ability to attribute their claims to source material. This undermines their reliability, particularly in multi-hop and high-stakes domains such as medicine. We propose Tree-of-Quote (ToQ) prompting, a structured framework that decomposes complex questions into subquestions, generates quotes to support each step without retrieval, and selectively advances reasoning based on quote quality. We also introduce FQ-Score, a unified metric that captures answer correctness, attribution fidelity, and reasoning quality. Experiments on StrategyQA, 2WikiMultiHopQA, MuSiQue, MoreHopQA, and MedQA demonstrate that ToQ improves factuality and attribution over standard prompting baselines. To validate FQ-Score as a proxy for human judgment, we conduct two reader studies with clinicians on medical questions, and observe strong correlations. Both clinician scores and FQ-Scores also indicate a preference for ToQ over baselines due to a combination of greater correctness, completeness, and logical flow. Our results suggest ToQ is a promising approach for building more trustworthy and auditable LLM systems.</abstract>
<identifier type="citekey">xu-etal-2025-tree</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.285</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.285/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>5605</start>
<end>5622</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Tree-of-Quote Prompting Improves Factuality and Attribution in Multi-Hop and Medical Reasoning
%A Xu, Justin
%A Li, Yiming
%A Zhang, Zizheng
%A Luk, Augustine Yui Hei
%A Jobanputra, Mayank
%A Oza, Samarth
%A Murray, Ashley
%A Kasula, Meghana Reddy
%A Parker, Andrew
%A Eyre, David W.
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F xu-etal-2025-tree
%X Large language models (LLMs) can produce fluent but factually incorrect outputs and often have limited ability to attribute their claims to source material. This undermines their reliability, particularly in multi-hop and high-stakes domains such as medicine. We propose Tree-of-Quote (ToQ) prompting, a structured framework that decomposes complex questions into subquestions, generates quotes to support each step without retrieval, and selectively advances reasoning based on quote quality. We also introduce FQ-Score, a unified metric that captures answer correctness, attribution fidelity, and reasoning quality. Experiments on StrategyQA, 2WikiMultiHopQA, MuSiQue, MoreHopQA, and MedQA demonstrate that ToQ improves factuality and attribution over standard prompting baselines. To validate FQ-Score as a proxy for human judgment, we conduct two reader studies with clinicians on medical questions, and observe strong correlations. Both clinician scores and FQ-Scores also indicate a preference for ToQ over baselines due to a combination of greater correctness, completeness, and logical flow. Our results suggest ToQ is a promising approach for building more trustworthy and auditable LLM systems.
%R 10.18653/v1/2025.emnlp-main.285
%U https://aclanthology.org/2025.emnlp-main.285/
%U https://doi.org/10.18653/v1/2025.emnlp-main.285
%P 5605-5622
Markdown (Informal)
[Tree-of-Quote Prompting Improves Factuality and Attribution in Multi-Hop and Medical Reasoning](https://aclanthology.org/2025.emnlp-main.285/) (Xu et al., EMNLP 2025)
ACL
- Justin Xu, Yiming Li, Zizheng Zhang, Augustine Yui Hei Luk, Mayank Jobanputra, Samarth Oza, Ashley Murray, Meghana Reddy Kasula, Andrew Parker, and David W Eyre. 2025. Tree-of-Quote Prompting Improves Factuality and Attribution in Multi-Hop and Medical Reasoning. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 5605–5622, Suzhou, China. Association for Computational Linguistics.