@inproceedings{zhang-etal-2025-automatic,
title = "An Automatic Method to Estimate Correctness of {RAG}",
author = "Zhang, Chi and
Datla, Vivek V. and
Shrivastava, Aditya and
Samuel, Alfy and
Huang, Zhiqi and
Kumar, Anoop and
Liu, Daben",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven and
Darwish, Kareem and
Agarwal, Apoorv",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics: Industry Track",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-industry.52/",
pages = "603--611",
abstract = "In sectors in where data quality is critical, like finance and healthcare, it is crucial to have confidence in not only the outputs generated by retrieval-augmented generation (RAG) models but also the process followed by the model while arriving at the output. Existing methods, such as hallucination detection and input-output entailment measurements, fail to capture the model`s internal state during answer generation. This paper introduces a novel approach to predict the correctness of the generated answer by modeling the model`s uncertainty on quantified perturbations of input. Extensive experiments across multiple large language models (LLMs) demonstrate that our approach quantifies RAG robustness by aligning predictions with ground truth with a Avg.Mean Square Error (MSE) 0.002 while offering flexibility for diverse qualitative metrics."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2025-automatic">
<titleInfo>
<title>An Automatic Method to Estimate Correctness of RAG</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="given">V</namePart>
<namePart type="family">Datla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aditya</namePart>
<namePart type="family">Shrivastava</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alfy</namePart>
<namePart type="family">Samuel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiqi</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daben</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Darwish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Apoorv</namePart>
<namePart type="family">Agarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In sectors in where data quality is critical, like finance and healthcare, it is crucial to have confidence in not only the outputs generated by retrieval-augmented generation (RAG) models but also the process followed by the model while arriving at the output. Existing methods, such as hallucination detection and input-output entailment measurements, fail to capture the model‘s internal state during answer generation. This paper introduces a novel approach to predict the correctness of the generated answer by modeling the model‘s uncertainty on quantified perturbations of input. Extensive experiments across multiple large language models (LLMs) demonstrate that our approach quantifies RAG robustness by aligning predictions with ground truth with a Avg.Mean Square Error (MSE) 0.002 while offering flexibility for diverse qualitative metrics.</abstract>
<identifier type="citekey">zhang-etal-2025-automatic</identifier>
<location>
<url>https://aclanthology.org/2025.coling-industry.52/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>603</start>
<end>611</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An Automatic Method to Estimate Correctness of RAG
%A Zhang, Chi
%A Datla, Vivek V.
%A Shrivastava, Aditya
%A Samuel, Alfy
%A Huang, Zhiqi
%A Kumar, Anoop
%A Liu, Daben
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%Y Darwish, Kareem
%Y Agarwal, Apoorv
%S Proceedings of the 31st International Conference on Computational Linguistics: Industry Track
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F zhang-etal-2025-automatic
%X In sectors in where data quality is critical, like finance and healthcare, it is crucial to have confidence in not only the outputs generated by retrieval-augmented generation (RAG) models but also the process followed by the model while arriving at the output. Existing methods, such as hallucination detection and input-output entailment measurements, fail to capture the model‘s internal state during answer generation. This paper introduces a novel approach to predict the correctness of the generated answer by modeling the model‘s uncertainty on quantified perturbations of input. Extensive experiments across multiple large language models (LLMs) demonstrate that our approach quantifies RAG robustness by aligning predictions with ground truth with a Avg.Mean Square Error (MSE) 0.002 while offering flexibility for diverse qualitative metrics.
%U https://aclanthology.org/2025.coling-industry.52/
%P 603-611
Markdown (Informal)
[An Automatic Method to Estimate Correctness of RAG](https://aclanthology.org/2025.coling-industry.52/) (Zhang et al., COLING 2025)
ACL
- Chi Zhang, Vivek V. Datla, Aditya Shrivastava, Alfy Samuel, Zhiqi Huang, Anoop Kumar, and Daben Liu. 2025. An Automatic Method to Estimate Correctness of RAG. In Proceedings of the 31st International Conference on Computational Linguistics: Industry Track, pages 603–611, Abu Dhabi, UAE. Association for Computational Linguistics.