% COLING 2025 paper record. The editor "Di Eugenio, Barbara" keeps the compound
% surname before the comma so BibTeX does not parse "Di" as a given name.
@inproceedings{zhou-etal-2025-credible,
    title = "How Credible Is an Answer From Retrieval-Augmented {LLM}s? Investigation and Evaluation With Multi-Hop {QA}",
    author = "Zhou, Yujia and
      Liu, Zheng and
      Dou, Zhicheng",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Di Eugenio, Barbara and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.285/",
    pages = "4232--4242",
    abstract = "Retrieval-augmented Large Language Models (RaLLMs) are reshaping knowledge acquisition, offering long-form, knowledge-grounded answers through advanced reasoning and generation capabilities. Despite the emergence of impactful systems like WebGPT and New Bing, the reliability of RaLLMs, especially in complex situations, is under scrutiny. Our study tackles this concern by evaluating RaLLMs' question-answering performance using a novel benchmark focusing on Correctness and Groundedness. Correctness measures the logical soundness of the responses, and Groundedness checks for support by relevant references. We introduce an automated model-based evaluation pipeline for multi-hop question-answering tasks, revealing RaLLMs' proneness to generating inaccuracies when dealing with flawed or partial knowledge. To improve accuracy, we introduce two reasoning strategies, `Self-Reflection' and `Self-Completion,' enabling RaLLMs to identify and fill knowledge gaps, significantly improving answer quality without extensive model retraining."
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for citekey zhou-etal-2025-credible: the paper itself (title,
     authors, abstract, pages) plus its host proceedings as a relatedItem
     (editors, publisher, place). -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="zhou-etal-2025-credible">
    <titleInfo>
      <title>How Credible Is an Answer From Retrieval-Augmented LLMs? Investigation and Evaluation With Multi-Hop QA</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Yujia</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zheng</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhicheng</namePart>
      <namePart type="family">Dou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 31st International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Owen</namePart>
        <namePart type="family">Rambow</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leo</namePart>
        <namePart type="family">Wanner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marianna</namePart>
        <namePart type="family">Apidianaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hend</namePart>
        <namePart type="family">Al-Khalifa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <!-- "Di Eugenio" is a compound family name; "Di" is not a given name. -->
        <namePart type="given">Barbara</namePart>
        <namePart type="family">Di Eugenio</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Schockaert</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Retrieval-augmented Large Language Models (RaLLMs) are reshaping knowledge acquisition, offering long-form, knowledge-grounded answers through advanced reasoning and generation capabilities. Despite the emergence of impactful systems like WebGPT and New Bing, the reliability of RaLLMs, especially in complex situations, is under scrutiny. Our study tackles this concern by evaluating RaLLMs’ question-answering performance using a novel benchmark focusing on Correctness and Groundedness. Correctness measures the logical soundness of the responses, and Groundedness checks for support by relevant references. We introduce an automated model-based evaluation pipeline for multi-hop question-answering tasks, revealing RaLLMs’ proneness to generating inaccuracies when dealing with flawed or partial knowledge. To improve accuracy, we introduce two reasoning strategies, ‘Self-Reflection’ and ‘Self-Completion,’ enabling RaLLMs to identify and fill knowledge gaps, significantly improving answer quality without extensive model retraining.</abstract>
    <identifier type="citekey">zhou-etal-2025-credible</identifier>
    <location>
      <url>https://aclanthology.org/2025.coling-main.285/</url>
    </location>
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>4232</start>
        <end>4242</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T How Credible Is an Answer From Retrieval-Augmented LLMs? Investigation and Evaluation With Multi-Hop QA
%A Zhou, Yujia
%A Liu, Zheng
%A Dou, Zhicheng
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F zhou-etal-2025-credible
%X Retrieval-augmented Large Language Models (RaLLMs) are reshaping knowledge acquisition, offering long-form, knowledge-grounded answers through advanced reasoning and generation capabilities. Despite the emergence of impactful systems like WebGPT and New Bing, the reliability of RaLLMs, especially in complex situations, is under scrutiny. Our study tackles this concern by evaluating RaLLMs’ question-answering performance using a novel benchmark focusing on Correctness and Groundedness. Correctness measures the logical soundness of the responses, and Groundedness checks for support by relevant references. We introduce an automated model-based evaluation pipeline for multi-hop question-answering tasks, revealing RaLLMs’ proneness to generating inaccuracies when dealing with flawed or partial knowledge. To improve accuracy, we introduce two reasoning strategies, ‘Self-Reflection’ and ‘Self-Completion,’ enabling RaLLMs to identify and fill knowledge gaps, significantly improving answer quality without extensive model retraining.
%U https://aclanthology.org/2025.coling-main.285/
%P 4232-4242
Markdown (Informal)
[How Credible Is an Answer From Retrieval-Augmented LLMs? Investigation and Evaluation With Multi-Hop QA](https://aclanthology.org/2025.coling-main.285/) (Zhou et al., COLING 2025)
ACL