@inproceedings{hongli-etal-2024-mitigating,
title = "Mitigating the Bias of Large Language Model Evaluation",
author = "Hongli, Zhou and
Hui, Huang and
Yunfei, Long and
Bing, Xu and
Conghui, Zhu and
Hailong, Cao and
Muyun, Yang and
Tiejun, Zhao",
editor = "Sun, Maosong and
Liang, Jiye and
Han, Xianpei and
Liu, Zhiyuan and
He, Yulan",
booktitle = "Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)",
month = jul,
year = "2024",
address = "Taiyuan, China",
publisher = "Chinese Information Processing Society of China",
url = "https://aclanthology.org/2024.ccl-1.101/",
pages = "1310--1319",
language = "eng",
  abstract = "Recently, there has been a trend of evaluating the Large Language Model (LLM) quality in the flavor of LLM-as-a-Judge, namely leveraging another LLM to evaluate the current output quality. However, existing judges are proven to be biased, namely they would favor answers which present better superficial quality (such as verbosity, fluency) while ignoring the instruction following ability. In this work, we propose systematic research about the bias of LLM-as-a-Judge. Specifically, for closed-source judge models, we apply calibration to mitigate the significance of superficial quality, both on probability level and prompt level. For open-source judge models, we propose to mitigate the bias by contrastive training, with curated negative samples that deviate from instruction but present better superficial quality. We apply our methods on the bias evaluation benchmark, and experiment results show our methods mitigate the bias by a large margin while maintaining a satisfactory evaluation accuracy."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hongli-etal-2024-mitigating">
<titleInfo>
<title>Mitigating the Bias of Large Language Model Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhou</namePart>
<namePart type="family">Hongli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huang</namePart>
<namePart type="family">Hui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Long</namePart>
<namePart type="family">Yunfei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xu</namePart>
<namePart type="family">Bing</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhu</namePart>
<namePart type="family">Conghui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cao</namePart>
<namePart type="family">Hailong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Muyun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhao</namePart>
<namePart type="family">Tiejun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maosong</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiye</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xianpei</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyuan</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Taiyuan, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recently, there has been a trend of evaluating the Large Language Model (LLM) quality in the flavor of LLM-as-a-Judge, namely leveraging another LLM to evaluate the current output quality. However, existing judges are proven to be biased, namely they would favor answers which present better superficial quality (such as verbosity, fluency) while ignoring the instruction following ability. In this work, we propose systematic research about the bias of LLM-as-a-Judge. Specifically, for closed-source judge models, we apply calibration to mitigate the significance of superficial quality, both on probability level and prompt level. For open-source judge models, we propose to mitigate the bias by contrastive training, with curated negative samples that deviate from instruction but present better superficial quality. We apply our methods on the bias evaluation benchmark, and experiment results show our methods mitigate the bias by a large margin while maintaining a satisfactory evaluation accuracy.</abstract>
<identifier type="citekey">hongli-etal-2024-mitigating</identifier>
<location>
<url>https://aclanthology.org/2024.ccl-1.101/</url>
</location>
<part>
<date>2024-07</date>
<extent unit="page">
<start>1310</start>
<end>1319</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mitigating the Bias of Large Language Model Evaluation
%A Hongli, Zhou
%A Hui, Huang
%A Yunfei, Long
%A Bing, Xu
%A Conghui, Zhu
%A Hailong, Cao
%A Muyun, Yang
%A Tiejun, Zhao
%Y Sun, Maosong
%Y Liang, Jiye
%Y Han, Xianpei
%Y Liu, Zhiyuan
%Y He, Yulan
%S Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)
%D 2024
%8 July
%I Chinese Information Processing Society of China
%C Taiyuan, China
%G eng
%F hongli-etal-2024-mitigating
%X Recently, there has been a trend of evaluating the Large Language Model (LLM) quality in the flavor of LLM-as-a-Judge, namely leveraging another LLM to evaluate the current output quality. However, existing judges are proven to be biased, namely they would favor answers which present better superficial quality (such as verbosity, fluency) while ignoring the instruction following ability. In this work, we propose systematic research about the bias of LLM-as-a-Judge. Specifically, for closed-source judge models, we apply calibration to mitigate the significance of superficial quality, both on probability level and prompt level. For open-source judge models, we propose to mitigate the bias by contrastive training, with curated negative samples that deviate from instruction but present better superficial quality. We apply our methods on the bias evaluation benchmark, and experiment results show our methods mitigate the bias by a large margin while maintaining a satisfactory evaluation accuracy.
%U https://aclanthology.org/2024.ccl-1.101/
%P 1310-1319
Markdown (Informal)
[Mitigating the Bias of Large Language Model Evaluation](https://aclanthology.org/2024.ccl-1.101/) (Hongli et al., CCL 2024)
ACL
Zhou Hongli, Huang Hui, Long Yunfei, Xu Bing, Zhu Conghui, Cao Hailong, Yang Muyun, and Zhao Tiejun. 2024. Mitigating the Bias of Large Language Model Evaluation. In Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference), pages 1310–1319, Taiyuan, China. Chinese Information Processing Society of China.
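
The abstract's probability-level calibration for closed-source judges can be sketched roughly as follows. This is only a hedged illustration of the general idea (discounting the part of the judge's preference that depends on surface quality alone), not the authors' implementation; the `judge_logprob` scoring callable, the prompt wording, and the `alpha` weight are all assumptions made for the example.

```python
# Minimal sketch of probability-level calibration for an LLM judge.
# Idea (from the abstract): down-weight the contribution of superficial
# quality (fluency, verbosity) so the score reflects instruction following.
# `judge_logprob` is a hypothetical stand-in for whatever scoring API a
# real judge model exposes; it is NOT taken from the paper.

from typing import Callable


def calibrated_score(
    judge_logprob: Callable[[str], float],
    instruction: str,
    answer: str,
    alpha: float = 1.0,
) -> float:
    """Score an answer while discounting instruction-independent quality.

    full       : judge's log-probability of a positive verdict given both
                 the instruction and the answer.
    superficial: the same verdict from the answer alone, which can only
                 reflect surface features such as fluency or length.
    The difference (scaled by alpha) keeps the part of the judgment that
    actually depends on following the instruction.
    """
    full = judge_logprob(
        f"Instruction: {instruction}\nAnswer: {answer}\nIs the answer good?"
    )
    superficial = judge_logprob(
        f"Answer: {answer}\nIs the answer well written?"
    )
    return full - alpha * superficial
```

To compare two candidate answers, one would compute `calibrated_score` for each and prefer the larger value; with `alpha = 0` this reduces to the uncalibrated judge.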