@inproceedings{rakshit-flanigan-2025-multi,
title = "Multi-{LLM} Verification for Question Answering under Conflicting Contexts",
author = "Rakshit, Geetanjali and
Flanigan, Jeffrey",
editor = "Angelova, Galia and
Kunilovskaya, Maria and
Escribe, Marie and
Mitkov, Ruslan",
booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ranlp-1.116/",
pages = "1012--1021",
abstract = "Open-domain question answering (ODQA) often requires models to resolve conflicting evidence retrieved from diverse sources{---}a task that remains challenging even for state-of-the-art large language models (LLMs). While single-agent techniques such as self-verification and self-consistency have shown promise across natural language understanding and generation tasks, and multi-agent approaches involving collaborative or competitive strategies have recently emerged, their effectiveness for ODQA in the presence of conflicting contexts remains underexplored. In this work, we investigate these techniques using the QACC dataset as a case study. We find that incorporating a multi-agent verification step{---}where the best answer is selected from among outputs generated by different LLMs{---}leads to improved performance. Interestingly, we also observe that requiring explanations during the verification step does not always improve answer quality. Our experiments evaluate three strong LLMs (GPT-4o, Claude 4, and DeepSeek-R1) across a range of prompting and verification baselines."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="rakshit-flanigan-2025-multi">
    <titleInfo>
      <title>Multi-LLM Verification for Question Answering under Conflicting Contexts</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Geetanjali</namePart>
      <namePart type="family">Rakshit</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jeffrey</namePart>
      <namePart type="family">Flanigan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Galia</namePart>
        <namePart type="family">Angelova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Kunilovskaya</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marie</namePart>
        <namePart type="family">Escribe</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ruslan</namePart>
        <namePart type="family">Mitkov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
        <place>
          <placeTerm type="text">Varna, Bulgaria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Open-domain question answering (ODQA) often requires models to resolve conflicting evidence retrieved from diverse sources—a task that remains challenging even for state-of-the-art large language models (LLMs). While single-agent techniques such as self-verification and self-consistency have shown promise across natural language understanding and generation tasks, and multi-agent approaches involving collaborative or competitive strategies have recently emerged, their effectiveness for ODQA in the presence of conflicting contexts remains underexplored. In this work, we investigate these techniques using the QACC dataset as a case study. We find that incorporating a multi-agent verification step—where the best answer is selected from among outputs generated by different LLMs—leads to improved performance. Interestingly, we also observe that requiring explanations during the verification step does not always improve answer quality. Our experiments evaluate three strong LLMs (GPT-4o, Claude 4, and DeepSeek-R1) across a range of prompting and verification baselines.</abstract>
    <identifier type="citekey">rakshit-flanigan-2025-multi</identifier>
    <location>
      <url>https://aclanthology.org/2025.ranlp-1.116/</url>
    </location>
    <part>
      <date>2025-09</date>
      <extent unit="page">
        <start>1012</start>
        <end>1021</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-LLM Verification for Question Answering under Conflicting Contexts
%A Rakshit, Geetanjali
%A Flanigan, Jeffrey
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F rakshit-flanigan-2025-multi
%X Open-domain question answering (ODQA) often requires models to resolve conflicting evidence retrieved from diverse sources—a task that remains challenging even for state-of-the-art large language models (LLMs). While single-agent techniques such as self-verification and self-consistency have shown promise across natural language understanding and generation tasks, and multi-agent approaches involving collaborative or competitive strategies have recently emerged, their effectiveness for ODQA in the presence of conflicting contexts remains underexplored. In this work, we investigate these techniques using the QACC dataset as a case study. We find that incorporating a multi-agent verification step—where the best answer is selected from among outputs generated by different LLMs—leads to improved performance. Interestingly, we also observe that requiring explanations during the verification step does not always improve answer quality. Our experiments evaluate three strong LLMs (GPT-4o, Claude 4, and DeepSeek-R1) across a range of prompting and verification baselines.
%U https://aclanthology.org/2025.ranlp-1.116/
%P 1012-1021
Markdown (Informal)
[Multi-LLM Verification for Question Answering under Conflicting Contexts](https://aclanthology.org/2025.ranlp-1.116/) (Rakshit & Flanigan, RANLP 2025)
ACL
Geetanjali Rakshit and Jeffrey Flanigan. 2025. [Multi-LLM Verification for Question Answering under Conflicting Contexts](https://aclanthology.org/2025.ranlp-1.116/). In *Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era*, pages 1012–1021, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.
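
For readers who want the gist of the verification step the abstract describes, here is a minimal sketch: each LLM answers the question independently given the (possibly conflicting) retrieved context, and a verifier model then selects the best-supported candidate. The `complete` wrapper, the prompt wording, and the reply parsing are illustrative assumptions, not the paper's actual implementation.

```python
# Minimal sketch of the select-best multi-LLM verification step described in
# the abstract. `complete` is a hypothetical stand-in for whatever
# chat-completion client you use.

def complete(model: str, prompt: str) -> str:
    """Hypothetical wrapper around an LLM chat-completion API."""
    raise NotImplementedError("plug in your LLM client here")


def multi_llm_verify(question: str, context: str,
                     models: list[str], verifier: str) -> str:
    # Step 1: each model answers independently, conditioned on the
    # (possibly conflicting) retrieved context.
    candidates = [
        complete(m, f"Context:\n{context}\n\nQuestion: {question}\nAnswer:")
        for m in models
    ]
    # Step 2: the verifier selects among the candidates. The abstract notes
    # that requiring an explanation at this step does not always help, so
    # this variant asks for only the index of the chosen answer.
    listing = "\n".join(f"({i}) {a}" for i, a in enumerate(candidates))
    choice = complete(
        verifier,
        f"Context:\n{context}\n\nQuestion: {question}\n\n"
        f"Candidate answers:\n{listing}\n\n"
        "Reply with only the number of the best-supported answer.",
    )
    # Fall back to the first candidate if the verifier's reply is unparsable.
    try:
        return candidates[int(choice.strip().strip("()"))]
    except (ValueError, IndexError):
        return candidates[0]
```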