% Conference paper describing DualRAG, team MindFlayer's system for
% SemEval-2026 Task 8 Subtask B (see abstract).
% NOTE(review): `address` appears to hold the conference location rather than
% the publisher's city -- confirm before using with styles that print it as
% the publisher location.
@inproceedings{tuli-etal-2026-mindflayer,
  title     = {{MindFlayer} at {SemEval-2026} Task 8: {DUALRAG}: Answerability-Aware Generation for Multi-Turn {RAG} Conversations},
  author    = {Tuli, Jerin Romijah and
               Pritom, Md. Sartaj Alam and
               Naem, Talukder Naemul Hasan},
  editor    = {Kochmar, Ekaterina and
               Ghosh, Debanjan and
               North, Kai and
               Komachi, Mamoru},
  booktitle = {Proceedings of the 20th {International} {Workshop} on {Semantic} {Evaluation} (2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.semeval-1.293/},
  pages     = {2314--2321},
  isbn      = {979-8-89176-414-9},
  abstract  = {Our system, DualRAG (team MindFlayer), tackles SemEval-2026 Task 8 Subtask B - generating faithful responses in multi-turn RAG conversations. The core idea is simple: before generating anything, we first check whether reference passages exist for the current question. If they do, we route through a domain-guided generation prompt that instructs the model to answer using only those passages. If they do not, we route through a strict refusal prompt that tells the model to politely decline rather than guess. We used Meta{'}s Llama-4-Scout-17B through the Groq API, with no training or fine-tuning - purely zero-shot prompting. A lightweight post-processing layer catches the rare cases where the model ignores its instructions: if it refuses when passages are available, we replace the response with a neutral fallback; if it answers when no passages exist, we replace it with a standard refusal. Out of 507 test tasks, only 7 needed this correction. The system ranked 8th out of 26 teams with a harmonic mean of 0.7492, beating the strongest baseline (GPT-OSS-120B at 0.639) by a notable margin. The standout result is 100{\%} refusal accuracy on all 130 unanswerable questions - something even GPT-4o and Llama 3.1 405B failed to achieve consistently according to prior work. Our RLF score of 0.8782 shows the responses stay tightly grounded in the reference passages. The relatively lower RBagg (0.6024) reflects the challenge of matching human-written phrasing in a zero-shot setting, which we identify as the clearest direction for improvement.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey tuli-etal-2026-mindflayer: one paper (authors, abstract, pages)
     with its host proceedings (editors, publisher, ISBN) nested in relatedItem type="host". -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="tuli-etal-2026-mindflayer">
    <!-- Paper title and authors -->
    <titleInfo>
      <title>MindFlayer at SemEval-2026 Task 8: DUALRAG: Answerability-Aware Generation for Multi-Turn RAG Conversations</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jerin</namePart>
      <namePart type="given">Romijah</namePart>
      <namePart type="family">Tuli</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Md.</namePart>
      <namePart type="given">Sartaj</namePart>
      <namePart type="given">Alam</namePart>
      <namePart type="family">Pritom</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Talukder</namePart>
      <namePart type="given">Naemul</namePart>
      <namePart type="given">Hasan</namePart>
      <namePart type="family">Naem</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host proceedings volume: title, editors, publisher, place, ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 20th International Workshop on Semantic Evaluation (2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Kochmar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Debanjan</namePart>
        <namePart type="family">Ghosh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kai</namePart>
        <namePart type="family">North</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mamoru</namePart>
        <namePart type="family">Komachi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-414-9</identifier>
    </relatedItem>
    <abstract>Our system, DualRAG (team MindFlayer), tackles SemEval-2026 Task 8 Subtask B - generating faithful responses in multi-turn RAG conversations. The core idea is simple: before generating anything, we first check whether reference passages exist for the current question. If they do, we route through a domain-guided generation prompt that instructs the model to answer using only those passages. If they do not, we route through a strict refusal prompt that tells the model to politely decline rather than guess. We used Meta’s Llama-4-Scout-17B through the Groq API, with no training or fine-tuning - purely zero-shot prompting. A lightweight post-processing layer catches the rare cases where the model ignores its instructions: if it refuses when passages are available, we replace the response with a neutral fallback; if it answers when no passages exist, we replace it with a standard refusal. Out of 507 test tasks, only 7 needed this correction. The system ranked 8th out of 26 teams with a harmonic mean of 0.7492, beating the strongest baseline (GPT-OSS-120B at 0.639) by a notable margin. The standout result is 100% refusal accuracy on all 130 unanswerable questions - something even GPT-4o and Llama 3.1 405B failed to achieve consistently according to prior work. Our RLF score of 0.8782 shows the responses stay tightly grounded in the reference passages. The relatively lower RBagg (0.6024) reflects the challenge of matching human-written phrasing in a zero-shot setting, which we identify as the clearest direction for improvement.</abstract>
    <identifier type="citekey">tuli-etal-2026-mindflayer</identifier>
    <location>
      <url>https://aclanthology.org/2026.semeval-1.293/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>2314</start>
        <end>2321</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T MindFlayer at SemEval-2026 Task 8: DUALRAG: Answerability-Aware Generation for Multi-Turn RAG Conversations
%A Tuli, Jerin Romijah
%A Pritom, Md. Sartaj Alam
%A Naem, Talukder Naemul Hasan
%Y Kochmar, Ekaterina
%Y Ghosh, Debanjan
%Y North, Kai
%Y Komachi, Mamoru
%S Proceedings of the 20th International Workshop on Semantic Evaluation (2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-414-9
%F tuli-etal-2026-mindflayer
%X Our system, DualRAG (team MindFlayer), tackles SemEval-2026 Task 8 Subtask B - generating faithful responses in multi-turn RAG conversations. The core idea is simple: before generating anything, we first check whether reference passages exist for the current question. If they do, we route through a domain-guided generation prompt that instructs the model to answer using only those passages. If they do not, we route through a strict refusal prompt that tells the model to politely decline rather than guess. We used Meta’s Llama-4-Scout-17B through the Groq API, with no training or fine-tuning - purely zero-shot prompting. A lightweight post-processing layer catches the rare cases where the model ignores its instructions: if it refuses when passages are available, we replace the response with a neutral fallback; if it answers when no passages exist, we replace it with a standard refusal. Out of 507 test tasks, only 7 needed this correction. The system ranked 8th out of 26 teams with a harmonic mean of 0.7492, beating the strongest baseline (GPT-OSS-120B at 0.639) by a notable margin. The standout result is 100% refusal accuracy on all 130 unanswerable questions - something even GPT-4o and Llama 3.1 405B failed to achieve consistently according to prior work. Our RLF score of 0.8782 shows the responses stay tightly grounded in the reference passages. The relatively lower RBagg (0.6024) reflects the challenge of matching human-written phrasing in a zero-shot setting, which we identify as the clearest direction for improvement.
%U https://aclanthology.org/2026.semeval-1.293/
%P 2314-2321
Markdown (Informal)
[MindFlayer at SemEval-2026 Task 8: DUALRAG: Answerability-Aware Generation for Multi-Turn RAG Conversations](https://aclanthology.org/2026.semeval-1.293/) (Tuli et al., SemEval 2026)
ACL