@inproceedings{alyahya-etal-2025-zerosumeval,
  title     = {{ZeroSumEval}: An Extensible Framework For Scaling {LLM} Evaluation with Inter-Model Competition},
  author    = {Alyahya, Hisham Abdullah and
               Khan, Haidar and
               Alnumay, Yazeed and
               Bari, M Saiful and
               Yener, Bulent},
  editor    = {Mishra, Pushkar and
               Muresan, Smaranda and
               Yu, Tao},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.acl-demo.33/},
  doi       = {10.18653/v1/2025.acl-demo.33},
  pages     = {340--350},
  isbn      = {979-8-89176-253-4},
  abstract  = {We introduce ZeroSumEval, a dynamic, competition-based, and evolving evaluation framework for Large Language Models (LLMs) that leverages competitive games. ZeroSumEval encompasses a diverse suite of games, including security challenges (Capture the Flag), classic board games (chess), and knowledge tests (MathQuiz). These games are designed to evaluate a range of capabilities such as strategic reasoning, planning, knowledge application, safety, and adaptability. Building upon recent studies that highlight the effectiveness of game-based evaluations for LLMs, ZeroSumEval enhances these approaches by providing a standardized and extensible framework for easily implementing games and leverages DSPy to provide a better abstraction for LLM player strategies.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alyahya-etal-2025-zerosumeval">
<titleInfo>
<title>ZeroSumEval: An Extensible Framework For Scaling LLM Evaluation with Inter-Model Competition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hisham</namePart>
<namePart type="given">Abdullah</namePart>
<namePart type="family">Alyahya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haidar</namePart>
<namePart type="family">Khan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yazeed</namePart>
<namePart type="family">Alnumay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">M</namePart>
<namePart type="given">Saiful</namePart>
<namePart type="family">Bari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bulent</namePart>
<namePart type="family">Yener</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pushkar</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tao</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-253-4</identifier>
</relatedItem>
<abstract>We introduce ZeroSumEval, a dynamic, competition-based, and evolving evaluation framework for Large Language Models (LLMs) that leverages competitive games. ZeroSumEval encompasses a diverse suite of games, including security challenges (Capture the Flag), classic board games (chess), and knowledge tests (MathQuiz). These games are designed to evaluate a range of capabilities such as strategic reasoning, planning, knowledge application, safety, and adaptability. Building upon recent studies that highlight the effectiveness of game-based evaluations for LLMs, ZeroSumEval enhances these approaches by providing a standardized and extensible framework for easily implementing games and leverages DSPy to provide a better abstraction for LLM player strategies.</abstract>
<identifier type="citekey">alyahya-etal-2025-zerosumeval</identifier>
<identifier type="doi">10.18653/v1/2025.acl-demo.33</identifier>
<location>
<url>https://aclanthology.org/2025.acl-demo.33/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>340</start>
<end>350</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ZeroSumEval: An Extensible Framework For Scaling LLM Evaluation with Inter-Model Competition
%A Alyahya, Hisham Abdullah
%A Khan, Haidar
%A Alnumay, Yazeed
%A Bari, M. Saiful
%A Yener, Bulent
%Y Mishra, Pushkar
%Y Muresan, Smaranda
%Y Yu, Tao
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-253-4
%F alyahya-etal-2025-zerosumeval
%X We introduce ZeroSumEval, a dynamic, competition-based, and evolving evaluation framework for Large Language Models (LLMs) that leverages competitive games. ZeroSumEval encompasses a diverse suite of games, including security challenges (Capture the Flag), classic board games (chess), and knowledge tests (MathQuiz). These games are designed to evaluate a range of capabilities such as strategic reasoning, planning, knowledge application, safety, and adaptability. Building upon recent studies that highlight the effectiveness of game-based evaluations for LLMs, ZeroSumEval enhances these approaches by providing a standardized and extensible framework for easily implementing games and leverages DSPy to provide a better abstraction for LLM player strategies.
%R 10.18653/v1/2025.acl-demo.33
%U https://aclanthology.org/2025.acl-demo.33/
%U https://doi.org/10.18653/v1/2025.acl-demo.33
%P 340-350
Markdown (Informal)
[ZeroSumEval: An Extensible Framework For Scaling LLM Evaluation with Inter-Model Competition](https://aclanthology.org/2025.acl-demo.33/) (Alyahya et al., ACL 2025)
ACL