@inproceedings{yue-etal-2025-qaeval,
title = "{QAE}val: Mixture of Evaluators for Question-Answering Task Evaluation",
author = "Yue, Tan and
Mao, Rui and
Shi, Xuzhao and
Zhan, Shuo and
Yang, Zuhao and
Zhao, Dongyan",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.716/",
doi = "10.18653/v1/2025.acl-long.716",
pages = "14717--14730",
ISBN = "979-8-89176-251-0",
abstract = "Question answering (QA) tasks serve as a key benchmark for evaluating generation systems. Traditional rule-based metrics, such as accuracy and relaxed-accuracy, struggle with open-ended and unstructured responses. LLM-based evaluation methods offer greater flexibility but suffer from sensitivity to instructions, robustness issues, and high computational costs. To overcome these challenges, we introduce QAEval, a hybrid framework combining rule-based reliability with LLM-based adaptability. QAEval utilizes two high-quality datasets: QAExtract for short-answer extraction and QAScore for scoring model training. By integrating a Mixture of Evaluators model with Dynamic Load Balancing Optimization, QAEval enables accurate, cost-effective QA evaluation. Experimental results show it outperforms models like GPT-4o and Claude-3, achieving 92.3{\%} accuracy with only 0.6B parameters."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yue-etal-2025-qaeval">
<titleInfo>
<title>QAEval: Mixture of Evaluators for Question-Answering Task Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tan</namePart>
<namePart type="family">Yue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Mao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuzhao</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuo</namePart>
<namePart type="family">Zhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zuhao</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongyan</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Question answering (QA) tasks serve as a key benchmark for evaluating generation systems. Traditional rule-based metrics, such as accuracy and relaxed-accuracy, struggle with open-ended and unstructured responses. LLM-based evaluation methods offer greater flexibility but suffer from sensitivity to instructions, robustness issues, and high computational costs. To overcome these challenges, we introduce QAEval, a hybrid framework combining rule-based reliability with LLM-based adaptability. QAEval utilizes two high-quality datasets: QAExtract for short-answer extraction and QAScore for scoring model training. By integrating a Mixture of Evaluators model with Dynamic Load Balancing Optimization, QAEval enables accurate, cost-effective QA evaluation. Experimental results show it outperforms models like GPT-4o and Claude-3, achieving 92.3% accuracy with only 0.6B parameters.</abstract>
<identifier type="citekey">yue-etal-2025-qaeval</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.716</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.716/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>14717</start>
<end>14730</end>
</extent>
</part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T QAEval: Mixture of Evaluators for Question-Answering Task Evaluation
%A Yue, Tan
%A Mao, Rui
%A Shi, Xuzhao
%A Zhan, Shuo
%A Yang, Zuhao
%A Zhao, Dongyan
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F yue-etal-2025-qaeval
%X Question answering (QA) tasks serve as a key benchmark for evaluating generation systems. Traditional rule-based metrics, such as accuracy and relaxed-accuracy, struggle with open-ended and unstructured responses. LLM-based evaluation methods offer greater flexibility but suffer from sensitivity to instructions, robustness issues, and high computational costs. To overcome these challenges, we introduce QAEval, a hybrid framework combining rule-based reliability with LLM-based adaptability. QAEval utilizes two high-quality datasets: QAExtract for short-answer extraction and QAScore for scoring model training. By integrating a Mixture of Evaluators model with Dynamic Load Balancing Optimization, QAEval enables accurate, cost-effective QA evaluation. Experimental results show it outperforms models like GPT-4o and Claude-3, achieving 92.3% accuracy with only 0.6B parameters.
%R 10.18653/v1/2025.acl-long.716
%U https://aclanthology.org/2025.acl-long.716/
%U https://doi.org/10.18653/v1/2025.acl-long.716
%P 14717-14730

Markdown (Informal)

[QAEval: Mixture of Evaluators for Question-Answering Task Evaluation](https://aclanthology.org/2025.acl-long.716/) (Yue et al., ACL 2025)

ACL

Tan Yue, Rui Mao, Xuzhao Shi, Shuo Zhan, Zuhao Yang, and Dongyan Zhao. 2025. QAEval: Mixture of Evaluators for Question-Answering Task Evaluation. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 14717–14730, Vienna, Austria. Association for Computational Linguistics.