@inproceedings{zhang-etal-2023-zhujiu,
title = "{Z}hu{J}iu: A Multi-dimensional, Multi-faceted {C}hinese Benchmark for Large Language Models",
author = "Zhang, Baoli and
Xie, Haining and
Du, Pengfan and
Chen, Junhao and
Cao, Pengfei and
Chen, Yubo and
Liu, Shengping and
Liu, Kang and
Zhao, Jun",
editor = "Feng, Yansong and
Lefever, Els",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-demo.44",
doi = "10.18653/v1/2023.emnlp-demo.44",
pages = "479--494",
abstract = "The unprecedented performance of LLMs requires comprehensive and accurate evaluation. We argue that for LLMs evaluation, benchmarks need to be comprehensive and systematic. To this end, we propose the Zhujiu benchmark, which has the following strengths: (1) Multi-dimensional ability coverage: We comprehensively evaluate LLMs across 7 ability dimensions covering 51 tasks. Especially, we also propose a new benchmark that focus on knowledge ability of LLMs. (2) Multi-faceted evaluation methods collaboration: We use 3 different yet complementary evaluation methods to comprehensively evaluate LLMs, which can ensure the authority and accuracy of the evaluation results. (3) Comprehensive Chinese benchmark: ZhuJiu is the pioneering benchmark that fully assesses LLMs in Chinese, while also providing equally robust evaluation abilities in English. (4) Avoiding potential data leakage: To avoid data leakage, we construct evaluation data specifically for 37 tasks. We evaluate 10 current mainstream LLMs, and conduct an in-depth discussion and analysis of their results. The ZhuJiu benchmark and open-participation leaderboard are publicly released at \url{http://www.zhujiu-benchmark.com} and we also provide a demo video at \url{https://youtu.be/qypkJ89L1Ic.}",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2023-zhujiu">
<titleInfo>
<title>ZhuJiu: A Multi-dimensional, Multi-faceted Chinese Benchmark for Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Baoli</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haining</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pengfan</namePart>
<namePart type="family">Du</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junhao</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pengfei</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yubo</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shengping</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yansong</namePart>
<namePart type="family">Feng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Els</namePart>
<namePart type="family">Lefever</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The unprecedented performance of LLMs requires comprehensive and accurate evaluation. We argue that for LLMs evaluation, benchmarks need to be comprehensive and systematic. To this end, we propose the Zhujiu benchmark, which has the following strengths: (1) Multi-dimensional ability coverage: We comprehensively evaluate LLMs across 7 ability dimensions covering 51 tasks. Especially, we also propose a new benchmark that focus on knowledge ability of LLMs. (2) Multi-faceted evaluation methods collaboration: We use 3 different yet complementary evaluation methods to comprehensively evaluate LLMs, which can ensure the authority and accuracy of the evaluation results. (3) Comprehensive Chinese benchmark: ZhuJiu is the pioneering benchmark that fully assesses LLMs in Chinese, while also providing equally robust evaluation abilities in English. (4) Avoiding potential data leakage: To avoid data leakage, we construct evaluation data specifically for 37 tasks. We evaluate 10 current mainstream LLMs, and conduct an in-depth discussion and analysis of their results. The ZhuJiu benchmark and open-participation leaderboard are publicly released at http://www.zhujiu-benchmark.com and we also provide a demo video at https://youtu.be/qypkJ89L1Ic.</abstract>
<identifier type="citekey">zhang-etal-2023-zhujiu</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-demo.44</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-demo.44</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>479</start>
<end>494</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ZhuJiu: A Multi-dimensional, Multi-faceted Chinese Benchmark for Large Language Models
%A Zhang, Baoli
%A Xie, Haining
%A Du, Pengfan
%A Chen, Junhao
%A Cao, Pengfei
%A Chen, Yubo
%A Liu, Shengping
%A Liu, Kang
%A Zhao, Jun
%Y Feng, Yansong
%Y Lefever, Els
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F zhang-etal-2023-zhujiu
%X The unprecedented performance of LLMs requires comprehensive and accurate evaluation. We argue that for LLMs evaluation, benchmarks need to be comprehensive and systematic. To this end, we propose the Zhujiu benchmark, which has the following strengths: (1) Multi-dimensional ability coverage: We comprehensively evaluate LLMs across 7 ability dimensions covering 51 tasks. Especially, we also propose a new benchmark that focus on knowledge ability of LLMs. (2) Multi-faceted evaluation methods collaboration: We use 3 different yet complementary evaluation methods to comprehensively evaluate LLMs, which can ensure the authority and accuracy of the evaluation results. (3) Comprehensive Chinese benchmark: ZhuJiu is the pioneering benchmark that fully assesses LLMs in Chinese, while also providing equally robust evaluation abilities in English. (4) Avoiding potential data leakage: To avoid data leakage, we construct evaluation data specifically for 37 tasks. We evaluate 10 current mainstream LLMs, and conduct an in-depth discussion and analysis of their results. The ZhuJiu benchmark and open-participation leaderboard are publicly released at http://www.zhujiu-benchmark.com and we also provide a demo video at https://youtu.be/qypkJ89L1Ic.
%R 10.18653/v1/2023.emnlp-demo.44
%U https://aclanthology.org/2023.emnlp-demo.44
%U https://doi.org/10.18653/v1/2023.emnlp-demo.44
%P 479-494
Markdown (Informal)
[ZhuJiu: A Multi-dimensional, Multi-faceted Chinese Benchmark for Large Language Models](https://aclanthology.org/2023.emnlp-demo.44) (Zhang et al., EMNLP 2023)
ACL
- Baoli Zhang, Haining Xie, Pengfan Du, Junhao Chen, Pengfei Cao, Yubo Chen, Shengping Liu, Kang Liu, and Jun Zhao. 2023. ZhuJiu: A Multi-dimensional, Multi-faceted Chinese Benchmark for Large Language Models. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pages 479–494, Singapore. Association for Computational Linguistics.