@inproceedings{derunets-etal-2026-raguteam,
title = "{R}agu{T}eam at {S}em{E}val-2026 Task 8: Meno and {F}riends in a Judge-Orchestrated {LLM} Ensemble for Faithful Multi-Turn Response Generation",
author = "Derunets, Roman and
Bondarenko, Ivan and
Sedukhin, Oleg and
Komarov, Mikhail and
Chernov, Ivan and
Kulakov, Mikhail",
editor = "Kochmar, Ekaterina and
Ghosh, Debanjan and
North, Kai and
Komachi, Mamoru",
booktitle = "Proceedings of the 20th {I}nternational {W}orkshop on {S}emantic {E}valuation (2026)",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.semeval-1.216/",
pages = "1678--1694",
ISBN = "979-8-89176-414-9",
abstract = "This paper describes our first-place submission to Task B (generation with reference passages) of the SemEval-2026 Task 8 MTRAGEval shared task on multi-turn retrieval-augmented generation. We propose a heterogeneous ensemble of seven LLMs organised into two groups with distinct prompting strategies, and use a GPT-4o-mini judge to select the best candidate response for each instance. Our system ranked first among 26 teams, achieving a conditioned harmonic mean score of 0.78 and substantially outperforming the strongest organiser baseline (0.64). Ablation experiments show that diversity across model families, scales, and prompting strategies is critical: the ensemble consistently outperforms any individual model. We also include Meno-Lite-0.1, a 7B domain-adapted model with a favourable cost{--}performance trade-off, and present an analysis of MTRAGEval that highlights annotation limitations and directions for benchmark improvement."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="derunets-etal-2026-raguteam">
<titleInfo>
<title>RaguTeam at SemEval-2026 Task 8: Meno and Friends in a Judge-Orchestrated LLM Ensemble for Faithful Multi-Turn Response Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Derunets</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Bondarenko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oleg</namePart>
<namePart type="family">Sedukhin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikhail</namePart>
<namePart type="family">Komarov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Chernov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikhail</namePart>
<namePart type="family">Kulakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Workshop on Semantic Evaluation (2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Kochmar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debanjan</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">North</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mamoru</namePart>
<namePart type="family">Komachi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-414-9</identifier>
</relatedItem>
<abstract>This paper describes our first-place submission to Task B (generation with reference passages) of the SemEval-2026 Task 8 MTRAGEval shared task on multi-turn retrieval-augmented generation. We propose a heterogeneous ensemble of seven LLMs organised into two groups with distinct prompting strategies, and use a GPT-4o-mini judge to select the best candidate response for each instance. Our system ranked first among 26 teams, achieving a conditioned harmonic mean score of 0.78 and substantially outperforming the strongest organiser baseline (0.64). Ablation experiments show that diversity across model families, scales, and prompting strategies is critical: the ensemble consistently outperforms any individual model. We also include Meno-Lite-0.1, a 7B domain-adapted model with a favourable cost–performance trade-off, and present an analysis of MTRAGEval that highlights annotation limitations and directions for benchmark improvement.</abstract>
<identifier type="citekey">derunets-etal-2026-raguteam</identifier>
<location>
<url>https://aclanthology.org/2026.semeval-1.216/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1678</start>
<end>1694</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T RaguTeam at SemEval-2026 Task 8: Meno and Friends in a Judge-Orchestrated LLM Ensemble for Faithful Multi-Turn Response Generation
%A Derunets, Roman
%A Bondarenko, Ivan
%A Sedukhin, Oleg
%A Komarov, Mikhail
%A Chernov, Ivan
%A Kulakov, Mikhail
%Y Kochmar, Ekaterina
%Y Ghosh, Debanjan
%Y North, Kai
%Y Komachi, Mamoru
%S Proceedings of the 20th International Workshop on Semantic Evaluation (2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-414-9
%F derunets-etal-2026-raguteam
%X This paper describes our first-place submission to Task B (generation with reference passages) of the SemEval-2026 Task 8 MTRAGEval shared task on multi-turn retrieval-augmented generation. We propose a heterogeneous ensemble of seven LLMs organised into two groups with distinct prompting strategies, and use a GPT-4o-mini judge to select the best candidate response for each instance. Our system ranked first among 26 teams, achieving a conditioned harmonic mean score of 0.78 and substantially outperforming the strongest organiser baseline (0.64). Ablation experiments show that diversity across model families, scales, and prompting strategies is critical: the ensemble consistently outperforms any individual model. We also include Meno-Lite-0.1, a 7B domain-adapted model with a favourable cost–performance trade-off, and present an analysis of MTRAGEval that highlights annotation limitations and directions for benchmark improvement.
%U https://aclanthology.org/2026.semeval-1.216/
%P 1678-1694
Markdown (Informal)
[RaguTeam at SemEval-2026 Task 8: Meno and Friends in a Judge-Orchestrated LLM Ensemble for Faithful Multi-Turn Response Generation](https://aclanthology.org/2026.semeval-1.216/) (Derunets et al., SemEval 2026)
ACL