@inproceedings{zhou-etal-2024-revisiting,
title = "Revisiting the Self-Consistency Challenges in Multi-Choice Question Formats for Large Language Model Evaluation",
author = "Zhou, Wenjie and
Wang, Qiang and
Xu, Mingzhou and
Chen, Ming and
Duan, Xiangyu",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1229/",
pages = "14103--14110",
abstract = "Multi-choice questions (MCQ) are a common method for assessing the world knowledge of large language models (LLMs), demonstrated by benchmarks such as MMLU and C-Eval. However, recent findings indicate that even top-tier LLMs, such as ChatGPT and GPT4, might display inconsistencies when faced with slightly varied inputs. This raises concerns about the credibility of MCQ-based evaluations. To address this issue, we introduced three knowledge-equivalent question variants: option position shuffle, option label replacement, and conversion to a True/False format. We rigorously tested a range of LLMs, varying in model size (from 6B to 70B) and types{---}pretrained language model (PLM), supervised fine-tuning (SFT), and reinforcement learning from human feedback (RLHF). Our findings from MMLU and C-Eval revealed that accuracy for individual questions lacks robustness, particularly in smaller models ({\ensuremath{<}}30B) and PLMs. Consequently, we advocate that consistent accuracy may serve as a more reliable metric for evaluating and ranking LLMs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhou-etal-2024-revisiting">
<titleInfo>
<title>Revisiting the Self-Consistency Challenges in Multi-Choice Question Formats for Large Language Model Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wenjie</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qiang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingzhou</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiangyu</namePart>
<namePart type="family">Duan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multi-choice questions (MCQ) are a common method for assessing the world knowledge of large language models (LLMs), demonstrated by benchmarks such as MMLU and C-Eval. However, recent findings indicate that even top-tier LLMs, such as ChatGPT and GPT4, might display inconsistencies when faced with slightly varied inputs. This raises concerns about the credibility of MCQ-based evaluations. To address this issue, we introduced three knowledge-equivalent question variants: option position shuffle, option label replacement, and conversion to a True/False format. We rigorously tested a range of LLMs, varying in model size (from 6B to 70B) and types—pretrained language model (PLM), supervised fine-tuning (SFT), and reinforcement learning from human feedback (RLHF). Our findings from MMLU and C-Eval revealed that accuracy for individual questions lacks robustness, particularly in smaller models (&lt;30B) and PLMs. Consequently, we advocate that consistent accuracy may serve as a more reliable metric for evaluating and ranking LLMs.</abstract>
<identifier type="citekey">zhou-etal-2024-revisiting</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1229/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>14103</start>
<end>14110</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Revisiting the Self-Consistency Challenges in Multi-Choice Question Formats for Large Language Model Evaluation
%A Zhou, Wenjie
%A Wang, Qiang
%A Xu, Mingzhou
%A Chen, Ming
%A Duan, Xiangyu
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F zhou-etal-2024-revisiting
%X Multi-choice questions (MCQ) are a common method for assessing the world knowledge of large language models (LLMs), demonstrated by benchmarks such as MMLU and C-Eval. However, recent findings indicate that even top-tier LLMs, such as ChatGPT and GPT4, might display inconsistencies when faced with slightly varied inputs. This raises concerns about the credibility of MCQ-based evaluations. To address this issue, we introduced three knowledge-equivalent question variants: option position shuffle, option label replacement, and conversion to a True/False format. We rigorously tested a range of LLMs, varying in model size (from 6B to 70B) and types—pretrained language model (PLM), supervised fine-tuning (SFT), and reinforcement learning from human feedback (RLHF). Our findings from MMLU and C-Eval revealed that accuracy for individual questions lacks robustness, particularly in smaller models (<30B) and PLMs. Consequently, we advocate that consistent accuracy may serve as a more reliable metric for evaluating and ranking LLMs.
%U https://aclanthology.org/2024.lrec-main.1229/
%P 14103-14110
Markdown (Informal)
[Revisiting the Self-Consistency Challenges in Multi-Choice Question Formats for Large Language Model Evaluation](https://aclanthology.org/2024.lrec-main.1229/) (Zhou et al., LREC-COLING 2024)
ACL
Wenjie Zhou, Qiang Wang, Mingzhou Xu, Ming Chen, and Xiangyu Duan. 2024. Revisiting the Self-Consistency Challenges in Multi-Choice Question Formats for Large Language Model Evaluation. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 14103–14110, Torino, Italia. ELRA and ICCL.
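
The abstract describes an evaluation protocol: build knowledge-equivalent variants of each MCQ (option position shuffle, option label replacement, True/False conversion) and score a question as correct only when the model answers consistently across variants. Below is a minimal Python sketch of that idea. All names, the dict-based question representation, and the exact definition of `consistent_accuracy` are illustrative assumptions, not taken from the paper's released code.

```python
import random

# Illustrative MCQ representation (not from the paper's code).
question = {
    "stem": "Which planet is closest to the Sun?",
    "options": ["Venus", "Mercury", "Earth", "Mars"],
    "answer_index": 1,  # index of the correct option
}

def shuffle_positions(q, seed=0):
    """Variant 1: option position shuffle (same options, new order)."""
    rng = random.Random(seed)
    order = list(range(len(q["options"])))
    rng.shuffle(order)
    return {
        "stem": q["stem"],
        "options": [q["options"][i] for i in order],
        # New position of the original correct option.
        "answer_index": order.index(q["answer_index"]),
    }

def relabel_options(q, labels=("P", "Q", "R", "S")):
    """Variant 2: option label replacement (e.g. A/B/C/D -> P/Q/R/S)."""
    lines = [f"{lab}. {opt}" for lab, opt in zip(labels, q["options"])]
    prompt = q["stem"] + "\n" + "\n".join(lines)
    return prompt, labels[q["answer_index"]]

def to_true_false(q):
    """Variant 3: conversion to a True/False format, one item per option."""
    return [
        (f"{q['stem']} Answer: {opt}. True or False?", i == q["answer_index"])
        for i, opt in enumerate(q["options"])
    ]

def consistent_accuracy(per_question_variant_correct):
    """One plausible reading of 'consistent accuracy': a question counts
    as correct only if ALL of its variants are answered correctly; the
    metric averages this all-or-nothing score over questions."""
    return (sum(all(v) for v in per_question_variant_correct)
            / len(per_question_variant_correct))

# Example: 3 questions, each judged on 3 variants.
print(consistent_accuracy([
    [True, True, True],
    [True, False, True],   # inconsistent -> counts as wrong
    [True, True, True],
]))  # -> 0.666...
```

Under this all-or-nothing definition, consistent accuracy is always at most the ordinary per-variant accuracy, which matches the abstract's point that single-format accuracy overstates robustness for smaller models.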