BibTeX
@inproceedings{spangher-etal-2025-chatbot,
    title = "Chatbot Arena Estimate: towards a generalized performance benchmark for {LLM} capabilities",
    author = "Spangher, Lucas and
      Li, Tianle and
      Arnold, William F. and
      Masiewicki, Nick and
      Dotiwalla, Xerxes and
      Pasumarthi, Rama Kumar and
      Grabowski, Peter and
      Ie, Eugene and
      Gruhl, Daniel",
    editor = "Chen, Weizhu and
      Yang, Yi and
      Kachuee, Mohammad and
      Fu, Xue-Yong",
    booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)",
    month = apr,
    year = "2025",
    address = "Albuquerque, New Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.naacl-industry.77/",
    doi = "10.18653/v1/2025.naacl-industry.77",
    pages = "1016--1025",
    ISBN = "979-8-89176-194-0",
    abstract = "In industrial LLM development, evaluating large language models (LLMs) is critical for tasks like benchmarking internal models and detecting regressions during fine-tuning, but existing benchmark aggregation methods, such as Elo-based systems, can be resource-intensive, public-facing, and time-consuming. Here, we describe \textbf{Chatbot Arena Estimate (CAE)}, a practical framework for aggregating performance across diverse benchmarks. The framework, developed and widely adopted within our organization, addresses the need for quick, accurate, and cost-efficient evaluations of LLMs. CAE generates two primary metrics: a ``Goodness'' score (answer accuracy) and a ``Fastness'' score (cost or queries per second, QPS). These metrics allow for model ranking both overall and within specific subdomains, enabling informed decisions during model iteration and deployment. We demonstrate CAE{'}s effectiveness by comparing it with existing benchmarks, including the full Chatbot Arena and the MMLU leaderboard. Notably, our approach achieves higher Pearson correlation with Chatbot Arena Elo scores than MMLU{'}s correlation with Chatbot Arena Elo scores, validating its reliability for real-world LLM evaluation."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="spangher-etal-2025-chatbot">
  <titleInfo>
    <title>Chatbot Arena Estimate: towards a generalized performance benchmark for LLM capabilities</title>
  </titleInfo>
  <name type="personal">
    <namePart type="given">Lucas</namePart>
    <namePart type="family">Spangher</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Tianle</namePart>
    <namePart type="family">Li</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">William</namePart>
    <namePart type="given">F</namePart>
    <namePart type="family">Arnold</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Nick</namePart>
    <namePart type="family">Masiewicki</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Xerxes</namePart>
    <namePart type="family">Dotiwalla</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Rama</namePart>
    <namePart type="given">Kumar</namePart>
    <namePart type="family">Pasumarthi</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Peter</namePart>
    <namePart type="family">Grabowski</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Eugene</namePart>
    <namePart type="family">Ie</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Daniel</namePart>
    <namePart type="family">Gruhl</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <originInfo>
    <dateIssued>2025-04</dateIssued>
  </originInfo>
  <typeOfResource>text</typeOfResource>
  <relatedItem type="host">
    <titleInfo>
      <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Weizhu</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yi</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mohammad</namePart>
      <namePart type="family">Kachuee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xue-Yong</namePart>
      <namePart type="family">Fu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <originInfo>
      <publisher>Association for Computational Linguistics</publisher>
      <place>
        <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
      </place>
    </originInfo>
    <genre authority="marcgt">conference publication</genre>
    <identifier type="isbn">979-8-89176-194-0</identifier>
  </relatedItem>
  <abstract>In industrial LLM development, evaluating large language models (LLMs) is critical for tasks like benchmarking internal models and detecting regressions during fine-tuning, but existing benchmark aggregation methods, such as Elo-based systems, can be resource-intensive, public-facing, and time-consuming. Here, we describe Chatbot Arena Estimate (CAE), a practical framework for aggregating performance across diverse benchmarks. The framework, developed and widely adopted within our organization, addresses the need for quick, accurate, and cost-efficient evaluations of LLMs. CAE generates two primary metrics: a “Goodness” score (answer accuracy) and a “Fastness” score (cost or queries per second, QPS). These metrics allow for model ranking both overall and within specific subdomains, enabling informed decisions during model iteration and deployment. We demonstrate CAE’s effectiveness by comparing it with existing benchmarks, including the full Chatbot Arena and the MMLU leaderboard. Notably, our approach achieves higher Pearson correlation with Chatbot Arena Elo scores than MMLU’s correlation with Chatbot Arena Elo scores, validating its reliability for real-world LLM evaluation.</abstract>
  <identifier type="citekey">spangher-etal-2025-chatbot</identifier>
  <identifier type="doi">10.18653/v1/2025.naacl-industry.77</identifier>
  <location>
    <url>https://aclanthology.org/2025.naacl-industry.77/</url>
  </location>
  <part>
    <date>2025-04</date>
    <extent unit="page">
      <start>1016</start>
      <end>1025</end>
    </extent>
  </part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Chatbot Arena Estimate: towards a generalized performance benchmark for LLM capabilities
%A Spangher, Lucas
%A Li, Tianle
%A Arnold, William F.
%A Masiewicki, Nick
%A Dotiwalla, Xerxes
%A Pasumarthi, Rama Kumar
%A Grabowski, Peter
%A Ie, Eugene
%A Gruhl, Daniel
%Y Chen, Weizhu
%Y Yang, Yi
%Y Kachuee, Mohammad
%Y Fu, Xue-Yong
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-194-0
%F spangher-etal-2025-chatbot
%X In industrial LLM development, evaluating large language models (LLMs) is critical for tasks like benchmarking internal models and detecting regressions during fine-tuning, but existing benchmark aggregation methods, such as Elo-based systems, can be resource-intensive, public-facing, and time-consuming. Here, we describe Chatbot Arena Estimate (CAE), a practical framework for aggregating performance across diverse benchmarks. The framework, developed and widely adopted within our organization, addresses the need for quick, accurate, and cost-efficient evaluations of LLMs. CAE generates two primary metrics: a “Goodness” score (answer accuracy) and a “Fastness” score (cost or queries per second, QPS). These metrics allow for model ranking both overall and within specific subdomains, enabling informed decisions during model iteration and deployment. We demonstrate CAE’s effectiveness by comparing it with existing benchmarks, including the full Chatbot Arena and the MMLU leaderboard. Notably, our approach achieves higher Pearson correlation with Chatbot Arena Elo scores than MMLU’s correlation with Chatbot Arena Elo scores, validating its reliability for real-world LLM evaluation.
%R 10.18653/v1/2025.naacl-industry.77
%U https://aclanthology.org/2025.naacl-industry.77/
%U https://doi.org/10.18653/v1/2025.naacl-industry.77
%P 1016-1025
Markdown (Informal)
[Chatbot Arena Estimate: towards a generalized performance benchmark for LLM capabilities](https://aclanthology.org/2025.naacl-industry.77/) (Spangher et al., NAACL 2025)
ACL
Lucas Spangher, Tianle Li, William F. Arnold, Nick Masiewicki, Xerxes Dotiwalla, Rama Kumar Pasumarthi, Peter Grabowski, Eugene Ie, and Daniel Gruhl. 2025. Chatbot Arena Estimate: towards a generalized performance benchmark for LLM capabilities. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track), pages 1016–1025, Albuquerque, New Mexico. Association for Computational Linguistics.
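The validation step described in the abstract correlates an aggregate benchmark score with Chatbot Arena Elo ratings. As a rough illustration of that comparison — not the authors' code, and with all model scores and Elo values invented for the example — a minimal Python sketch might look like:

```python
# Illustrative only: compare how well two benchmark scores track
# Chatbot Arena Elo via Pearson correlation, as in the paper's
# validation. All numbers below are hypothetical, not from the paper.
import numpy as np

elo = np.array([1250.0, 1190.0, 1120.0, 1060.0, 990.0])  # Chatbot Arena Elo
cae_goodness = np.array([0.92, 0.88, 0.81, 0.74, 0.63])  # aggregate "Goodness"
mmlu = np.array([0.86, 0.87, 0.78, 0.70, 0.66])          # MMLU accuracy

def pearson(x: np.ndarray, y: np.ndarray) -> float:
    """Pearson correlation coefficient between two score vectors."""
    return float(np.corrcoef(x, y)[0, 1])

print(f"CAE Goodness vs. Elo: r = {pearson(cae_goodness, elo):.3f}")
print(f"MMLU vs. Elo:         r = {pearson(mmlu, elo):.3f}")
```

Pearson r is invariant to linear rescaling, so raw Elo points and 0–1 accuracies can be correlated directly without normalization.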