@comment{Paper in the MeLLM 2026 workshop proceedings, pages 92--107.}
@inproceedings{mohan-etal-2026-retrieval,
    title = "When Retrieval Hurts: Evidence Utilization, Script Fidelity, and Knowledge Conflicts in Multilingual {RAG}",
    author = "Mohan, Varalekshmy M and
      Jayakumar, Swathi and
      Menon, Gadha Saji and
      Kurup, Sachin and
      G, Veena and
      Kanjirangat, Vani",
    editor = "Huang, Kaiyu and
      Mo, Fengran and
      Chen, Pinzhen and
      Jiang, Meng",
    booktitle = "Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models ({MeLLM} 2026)",
    month = jul,
    year = "2026",
    address = "San Diego, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.mellm-1.9/",
    pages = "92--107",
    ISBN = "979-8-89176-430-9",
    abstract = "The problem of extractive multilingual QA with LLMs is characterized by complex interactions among retrieval mechanisms, knowledge source configurations, prompting techniques, and scripting biases. Despite high retrieval quality, multilingual RAG often degrades performance, revealing a gap between retrieved evidence and its effective utilization. To address this issue, this paper offers an extensive empirical study that examines these components by comparing retrieval-augmented generation (RAG) with a non-RAG baseline across 21 typologically diverse languages and 5 leading LLMs. Our analysis includes five prompting strategies and multiple retrieval configurations, which enable a unified evaluation across diverse linguistic settings. We have also observed an evidence utilization gap in RAG settings, where RAG underperforms despite high retrieval hit rates due to models' inefficiency in leveraging the retrieved evidence. We also introduce lightweight inference-time metrics to better characterize retrieval usage and conflict patterns. We also highlight script fidelity as a crucial factor in our experiments, as non-Latin-script languages experience significant performance drops and increased hallucinations without proper grounding. Further, we analyzed generator language preferences, systematically examined conflicts, and identified mechanisms for the effective detection and resolution of conflicts. The study further details how prompting strategies affect language families and script types, offering a detailed analysis for optimizing future multilingual RAG settings."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mohan-etal-2026-retrieval">
<titleInfo>
<title>When Retrieval Hurts: Evidence Utilization, Script Fidelity, and Knowledge Conflicts in Multilingual RAG</title>
</titleInfo>
<name type="personal">
<namePart type="given">Varalekshmy</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Mohan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Swathi</namePart>
<namePart type="family">Jayakumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gadha</namePart>
<namePart type="given">Saji</namePart>
<namePart type="family">Menon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sachin</namePart>
<namePart type="family">Kurup</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veena</namePart>
<namePart type="family">G</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vani</namePart>
<namePart type="family">Kanjirangat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models (MeLLM 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kaiyu</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fengran</namePart>
<namePart type="family">Mo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pinzhen</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meng</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-430-9</identifier>
</relatedItem>
<abstract>The problem of extractive multilingual QA with LLMs is characterized by complex interactions among retrieval mechanisms, knowledge source configurations, prompting techniques, and scripting biases. Despite high retrieval quality, multilingual RAG often degrades performance, revealing a gap between retrieved evidence and its effective utilization. To address this issue, this paper offers an extensive empirical study that examines these components by comparing retrieval-augmented generation (RAG) with a non-RAG baseline across 21 typologically diverse languages and 5 leading LLMs. Our analysis includes five prompting strategies and multiple retrieval configurations, which enable a unified evaluation across diverse linguistic settings. We have also observed an evidence utilization gap in RAG settings, where RAG underperforms despite high retrieval hit rates due to models’ inefficiency in leveraging the retrieved evidence. We also introduce lightweight inference-time metrics to better characterize retrieval usage and conflict patterns. We also highlight script fidelity as a crucial factor in our experiments, as non-Latin-script languages experience significant performance drops and increased hallucinations without proper grounding. Further, we analyzed generator language preferences, systematically examined conflicts, and identified mechanisms for the effective detection and resolution of conflicts. The study further details how prompting strategies affect language families and script types, offering a detailed analysis for optimizing future multilingual RAG settings.</abstract>
<identifier type="citekey">mohan-etal-2026-retrieval</identifier>
<location>
<url>https://aclanthology.org/2026.mellm-1.9/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>92</start>
<end>107</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T When Retrieval Hurts: Evidence Utilization, Script Fidelity, and Knowledge Conflicts in Multilingual RAG
%A Mohan, Varalekshmy M.
%A Jayakumar, Swathi
%A Menon, Gadha Saji
%A Kurup, Sachin
%A G, Veena
%A Kanjirangat, Vani
%Y Huang, Kaiyu
%Y Mo, Fengran
%Y Chen, Pinzhen
%Y Jiang, Meng
%S Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models (MeLLM 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, United States
%@ 979-8-89176-430-9
%F mohan-etal-2026-retrieval
%X The problem of extractive multilingual QA with LLMs is characterized by complex interactions among retrieval mechanisms, knowledge source configurations, prompting techniques, and scripting biases. Despite high retrieval quality, multilingual RAG often degrades performance, revealing a gap between retrieved evidence and its effective utilization. To address this issue, this paper offers an extensive empirical study that examines these components by comparing retrieval-augmented generation (RAG) with a non-RAG baseline across 21 typologically diverse languages and 5 leading LLMs. Our analysis includes five prompting strategies and multiple retrieval configurations, which enable a unified evaluation across diverse linguistic settings. We have also observed an evidence utilization gap in RAG settings, where RAG underperforms despite high retrieval hit rates due to models’ inefficiency in leveraging the retrieved evidence. We also introduce lightweight inference-time metrics to better characterize retrieval usage and conflict patterns. We also highlight script fidelity as a crucial factor in our experiments, as non-Latin-script languages experience significant performance drops and increased hallucinations without proper grounding. Further, we analyzed generator language preferences, systematically examined conflicts, and identified mechanisms for the effective detection and resolution of conflicts. The study further details how prompting strategies affect language families and script types, offering a detailed analysis for optimizing future multilingual RAG settings.
%U https://aclanthology.org/2026.mellm-1.9/
%P 92-107
Markdown (Informal)
[When Retrieval Hurts: Evidence Utilization, Script Fidelity, and Knowledge Conflicts in Multilingual RAG](https://aclanthology.org/2026.mellm-1.9/) (Mohan et al., MeLLM 2026)
ACL
- Varalekshmy M Mohan, Swathi Jayakumar, Gadha Saji Menon, Sachin Kurup, Veena G, and Vani Kanjirangat. 2026. When Retrieval Hurts: Evidence Utilization, Script Fidelity, and Knowledge Conflicts in Multilingual RAG. In Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models (MeLLM 2026), pages 92–107, San Diego, United States. Association for Computational Linguistics.