@inproceedings{chen-etal-2025-llmspark,
title = "{LLM}s{P}ark: A Benchmark for Evaluating Large Language Models in Strategic Gaming Contexts",
author = "Chen, Junhao and
Sun, Jingbo and
Li, Xiang and
Xin, Haidong and
Xue, Yuhao and
Xu, Yibin and
Zhao, Hao",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.12/",
pages = "182--194",
ISBN = "979-8-89176-335-7",
abstract = "As large language models (LLMs) advance across diverse tasks, the need for comprehensive evaluation beyond single metrics becomes increasingly important. To fully assess LLM intelligence, it is crucial to examine their interactive dynamics and strategic behaviors. We present LLMsPark, a game theory{--}based evaluation platform that measures LLMs' decision-making strategies and social behaviors in classic game-theoretic settings, providing a multi-agent environment to explore strategic depth. Our system cross-evaluates 15 leading LLMs (both commercial and open-source) using leaderboard rankings and scoring mechanisms. Higher scores reflect stronger reasoning and strategic capabilities, revealing distinct behavioral patterns and performance differences across models. This work introduces a novel perspective for evaluating LLMs' strategic intelligence, enriching existing benchmarks and broadening their assessment in interactive, game-theoretic scenarios. The benchmark and rankings are publicly available at https://llmsparks.github.io/."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2025-llmspark">
<titleInfo>
<title>LLMsPark: A Benchmark for Evaluating Large Language Models in Strategic Gaming Contexts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Junhao</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingbo</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haidong</namePart>
<namePart type="family">Xin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuhao</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yibin</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>As large language models (LLMs) advance across diverse tasks, the need for comprehensive evaluation beyond single metrics becomes increasingly important. To fully assess LLM intelligence, it is crucial to examine their interactive dynamics and strategic behaviors. We present LLMsPark, a game theory–based evaluation platform that measures LLMs’ decision-making strategies and social behaviors in classic game-theoretic settings, providing a multi-agent environment to explore strategic depth. Our system cross-evaluates 15 leading LLMs (both commercial and open-source) using leaderboard rankings and scoring mechanisms. Higher scores reflect stronger reasoning and strategic capabilities, revealing distinct behavioral patterns and performance differences across models. This work introduces a novel perspective for evaluating LLMs’ strategic intelligence, enriching existing benchmarks and broadening their assessment in interactive, game-theoretic scenarios. The benchmark and rankings are publicly available at https://llmsparks.github.io/.</abstract>
<identifier type="citekey">chen-etal-2025-llmspark</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.12/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>182</start>
<end>194</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LLMsPark: A Benchmark for Evaluating Large Language Models in Strategic Gaming Contexts
%A Chen, Junhao
%A Sun, Jingbo
%A Li, Xiang
%A Xin, Haidong
%A Xue, Yuhao
%A Xu, Yibin
%A Zhao, Hao
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F chen-etal-2025-llmspark
%X As large language models (LLMs) advance across diverse tasks, the need for comprehensive evaluation beyond single metrics becomes increasingly important. To fully assess LLM intelligence, it is crucial to examine their interactive dynamics and strategic behaviors. We present LLMsPark, a game theory–based evaluation platform that measures LLMs’ decision-making strategies and social behaviors in classic game-theoretic settings, providing a multi-agent environment to explore strategic depth. Our system cross-evaluates 15 leading LLMs (both commercial and open-source) using leaderboard rankings and scoring mechanisms. Higher scores reflect stronger reasoning and strategic capabilities, revealing distinct behavioral patterns and performance differences across models. This work introduces a novel perspective for evaluating LLMs’ strategic intelligence, enriching existing benchmarks and broadening their assessment in interactive, game-theoretic scenarios. The benchmark and rankings are publicly available at https://llmsparks.github.io/.
%U https://aclanthology.org/2025.findings-emnlp.12/
%P 182-194
Markdown (Informal)
[LLMsPark: A Benchmark for Evaluating Large Language Models in Strategic Gaming Contexts](https://aclanthology.org/2025.findings-emnlp.12/) (Chen et al., Findings 2025)
ACL