@inproceedings{parvez-2025-chain,
title = "Chain of Evidences and Evidence to Generate: Prompting for Context Grounded and Retrieval Augmented Reasoning",
author = "Parvez, Md Rizwan",
editor = "Shi, Weijia and
Yu, Wenhao and
Asai, Akari and
Jiang, Meng and
Durrett, Greg and
Hajishirzi, Hannaneh and
Zettlemoyer, Luke",
booktitle = "Proceedings of the 4th International Workshop on Knowledge-Augmented Methods for Natural Language Processing",
month = may,
year = "2025",
address = "Albuquerque, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.knowledgenlp-1.21/",
doi = "10.18653/v1/2025.knowledgenlp-1.21",
pages = "230--245",
ISBN = "979-8-89176-229-9",
abstract = "While chain-of-thoughts (CoT) prompting has revolutionized how LLMs perform reasoning tasks, its current methods and variations (e.g, Self-consistency, ReACT, Reflexion, Tree-of-Thoughts (ToT), Cumulative Reasoning (CR) etc.,) suffer from limitations like limited context grounding, hallucination/inconsistent output generation, and iterative sluggishness. To overcome these challenges, we introduce a novel mono/dual-step zero-shot prompting framework built upon two unique strategies \textbf{Chain of Evidences (CoE)} and \textbf{Evidence to Generate (E2G)}. Instead of unverified reasoning claims, our innovative approaches leverage the power of ``evidence for decision making'' by first focusing exclusively on the thought sequences explicitly mentioned in the context which then serve as extracted evidence, guiding the LLM{'}s output generation process with greater precision and efficiency. This simple yet potent approach unlocks the full potential of chain-of-thoughts prompting, facilitating faster, more reliable, and contextually aware reasoning in LLMs. Our framework consistently achieves remarkable results across various knowledge-intensive reasoning and generation tasks, surpassing baseline approaches with state-of-the-art LLMs. For instance, (i) on the LogiQA benchmark using GPT-4, CoE achieves a new state-of-the-art accuracy of 53.8{\%}, surpassing CoT by 18{\%}, ToT by 11{\%}, and CR by 9{\%}; (ii) CoE with PaLM-2 outperforms the variable-shot performance of Gemini Ultra by 0.9 F1 points, achieving an F1 score of 83.3 on DROP. We release our prompts and outputs on these benchmarks as a new instruction tuning dataset for future research at \textit{Hugging Face}."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="parvez-2025-chain">
<titleInfo>
<title>Chain of Evidences and Evidence to Generate: Prompting for Context Grounded and Retrieval Augmented Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="given">Rizwan</namePart>
<namePart type="family">Parvez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th International Workshop on Knowledge-Augmented Methods for Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Weijia</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenhao</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akari</namePart>
<namePart type="family">Asai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meng</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Greg</namePart>
<namePart type="family">Durrett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hannaneh</namePart>
<namePart type="family">Hajishirzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">Zettlemoyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-229-9</identifier>
</relatedItem>
<abstract>While chain-of-thoughts (CoT) prompting has revolutionized how LLMs perform reasoning tasks, its current methods and variations (e.g, Self-consistency, ReACT, Reflexion, Tree-of-Thoughts (ToT), Cumulative Reasoning (CR) etc.,) suffer from limitations like limited context grounding, hallucination/inconsistent output generation, and iterative sluggishness. To overcome these challenges, we introduce a novel mono/dual-step zero-shot prompting framework built upon two unique strategies Chain of Evidences (CoE) and Evidence to Generate (E2G). Instead of unverified reasoning claims, our innovative approaches leverage the power of “evidence for decision making” by first focusing exclusively on the thought sequences explicitly mentioned in the context which then serve as extracted evidence, guiding the LLM’s output generation process with greater precision and efficiency. This simple yet potent approach unlocks the full potential of chain-of-thoughts prompting, facilitating faster, more reliable, and contextually aware reasoning in LLMs. Our framework consistently achieves remarkable results across various knowledge-intensive reasoning and generation tasks, surpassing baseline approaches with state-of-the-art LLMs. For instance, (i) on the LogiQA benchmark using GPT-4, CoE achieves a new state-of-the-art accuracy of 53.8%, surpassing CoT by 18%, ToT by 11%, and CR by 9%; (ii) CoE with PaLM-2 outperforms the variable-shot performance of Gemini Ultra by 0.9 F1 points, achieving an F1 score of 83.3 on DROP. We release our prompts and outputs on these benchmarks as a new instruction tuning dataset for future research at Hugging Face.</abstract>
<identifier type="citekey">parvez-2025-chain</identifier>
<identifier type="doi">10.18653/v1/2025.knowledgenlp-1.21</identifier>
<location>
<url>https://aclanthology.org/2025.knowledgenlp-1.21/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>230</start>
<end>245</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Chain of Evidences and Evidence to Generate: Prompting for Context Grounded and Retrieval Augmented Reasoning
%A Parvez, Md Rizwan
%Y Shi, Weijia
%Y Yu, Wenhao
%Y Asai, Akari
%Y Jiang, Meng
%Y Durrett, Greg
%Y Hajishirzi, Hannaneh
%Y Zettlemoyer, Luke
%S Proceedings of the 4th International Workshop on Knowledge-Augmented Methods for Natural Language Processing
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico, USA
%@ 979-8-89176-229-9
%F parvez-2025-chain
%X While chain-of-thoughts (CoT) prompting has revolutionized how LLMs perform reasoning tasks, its current methods and variations (e.g, Self-consistency, ReACT, Reflexion, Tree-of-Thoughts (ToT), Cumulative Reasoning (CR) etc.,) suffer from limitations like limited context grounding, hallucination/inconsistent output generation, and iterative sluggishness. To overcome these challenges, we introduce a novel mono/dual-step zero-shot prompting framework built upon two unique strategies Chain of Evidences (CoE) and Evidence to Generate (E2G). Instead of unverified reasoning claims, our innovative approaches leverage the power of “evidence for decision making” by first focusing exclusively on the thought sequences explicitly mentioned in the context which then serve as extracted evidence, guiding the LLM’s output generation process with greater precision and efficiency. This simple yet potent approach unlocks the full potential of chain-of-thoughts prompting, facilitating faster, more reliable, and contextually aware reasoning in LLMs. Our framework consistently achieves remarkable results across various knowledge-intensive reasoning and generation tasks, surpassing baseline approaches with state-of-the-art LLMs. For instance, (i) on the LogiQA benchmark using GPT-4, CoE achieves a new state-of-the-art accuracy of 53.8%, surpassing CoT by 18%, ToT by 11%, and CR by 9%; (ii) CoE with PaLM-2 outperforms the variable-shot performance of Gemini Ultra by 0.9 F1 points, achieving an F1 score of 83.3 on DROP. We release our prompts and outputs on these benchmarks as a new instruction tuning dataset for future research at Hugging Face.
%R 10.18653/v1/2025.knowledgenlp-1.21
%U https://aclanthology.org/2025.knowledgenlp-1.21/
%U https://doi.org/10.18653/v1/2025.knowledgenlp-1.21
%P 230-245
Markdown (Informal)
[Chain of Evidences and Evidence to Generate: Prompting for Context Grounded and Retrieval Augmented Reasoning](https://aclanthology.org/2025.knowledgenlp-1.21/) (Parvez, KnowledgeNLP 2025)
ACL