@inproceedings{zheng-etal-2025-flageval,
title = "{F}lag{E}val-Arena: A Side-by-Side Comparative Evaluation Platform for Large Language Models and Text-Driven {AIGC}",
author = "Zheng, Jing-Shu and
Xuan, Richeng and
Qin, Bowen and
He, Zheqi and
Tongshuai.ren, Tongshuai.ren and
Li, Xuejing and
Yao, Jin-Ge and
Yang, Xi",
editor = "Mishra, Pushkar and
Muresan, Smaranda and
Yu, Tao",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-demo.56/",
doi = "10.18653/v1/2025.acl-demo.56",
pages = "583--591",
ISBN = "979-8-89176-253-4",
abstract = "We introduce FlagEval-Arena, an evaluation platform for side-by-side comparisons of large language models and text-driven AIGC systems. Compared with the well-known LM Arena (LMSYS Chatbot Arena), we reimplement our own framework with the flexibility to introduce new mechanisms or features. Our platform enables side-by-side evaluation not only for language models or vision-language models, but also text-to-image or text-to-video synthesis. We specifically target a Chinese audience with more focus on the Chinese language, more models developed by Chinese institutes, and more general usage beyond the technical community. As a result, we currently observe very interesting differences from usual results presented by LM Arena. Our platform is available via this URL: \url{https://flageval.baai.org/#/arena}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zheng-etal-2025-flageval">
<titleInfo>
<title>FlagEval-Arena: A Side-by-Side Comparative Evaluation Platform for Large Language Models and Text-Driven AIGC</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jing-Shu</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Richeng</namePart>
<namePart type="family">Xuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bowen</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheqi</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tongshuai.ren</namePart>
<namePart type="family">Tongshuai.ren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuejing</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jin-Ge</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xi</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pushkar</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tao</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-253-4</identifier>
</relatedItem>
<abstract>We introduce FlagEval-Arena, an evaluation platform for side-by-side comparisons of large language models and text-driven AIGC systems. Compared with the well-known LM Arena (LMSYS Chatbot Arena), we reimplement our own framework with the flexibility to introduce new mechanisms or features. Our platform enables side-by-side evaluation not only for language models or vision-language models, but also text-to-image or text-to-video synthesis. We specifically target a Chinese audience with more focus on the Chinese language, more models developed by Chinese institutes, and more general usage beyond the technical community. As a result, we currently observe very interesting differences from usual results presented by LM Arena. Our platform is available via this URL: https://flageval.baai.org/#/arena.</abstract>
<identifier type="citekey">zheng-etal-2025-flageval</identifier>
<identifier type="doi">10.18653/v1/2025.acl-demo.56</identifier>
<location>
<url>https://aclanthology.org/2025.acl-demo.56/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>583</start>
<end>591</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T FlagEval-Arena: A Side-by-Side Comparative Evaluation Platform for Large Language Models and Text-Driven AIGC
%A Zheng, Jing-Shu
%A Xuan, Richeng
%A Qin, Bowen
%A He, Zheqi
%A Tongshuai.ren, Tongshuai.ren
%A Li, Xuejing
%A Yao, Jin-Ge
%A Yang, Xi
%Y Mishra, Pushkar
%Y Muresan, Smaranda
%Y Yu, Tao
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-253-4
%F zheng-etal-2025-flageval
%X We introduce FlagEval-Arena, an evaluation platform for side-by-side comparisons of large language models and text-driven AIGC systems. Compared with the well-known LM Arena (LMSYS Chatbot Arena), we reimplement our own framework with the flexibility to introduce new mechanisms or features. Our platform enables side-by-side evaluation not only for language models or vision-language models, but also text-to-image or text-to-video synthesis. We specifically target a Chinese audience with more focus on the Chinese language, more models developed by Chinese institutes, and more general usage beyond the technical community. As a result, we currently observe very interesting differences from usual results presented by LM Arena. Our platform is available via this URL: https://flageval.baai.org/#/arena.
%R 10.18653/v1/2025.acl-demo.56
%U https://aclanthology.org/2025.acl-demo.56/
%U https://doi.org/10.18653/v1/2025.acl-demo.56
%P 583-591
Markdown (Informal)
[FlagEval-Arena: A Side-by-Side Comparative Evaluation Platform for Large Language Models and Text-Driven AIGC](https://aclanthology.org/2025.acl-demo.56/) (Zheng et al., ACL 2025)
ACL
- Jing-Shu Zheng, Richeng Xuan, Bowen Qin, Zheqi He, Tongshuai.ren Tongshuai.ren, Xuejing Li, Jin-Ge Yao, and Xi Yang. 2025. FlagEval-Arena: A Side-by-Side Comparative Evaluation Platform for Large Language Models and Text-Driven AIGC. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations), pages 583–591, Vienna, Austria. Association for Computational Linguistics.