@inproceedings{hakimov-etal-2025-ad,
title = "Ad-hoc Concept Forming in the Game Codenames as a Means for Evaluating Large Language Models",
author = "Hakimov, Sherzod and
Pfennigschmidt, Lara and
Schlangen, David",
editor = "Arviv, Ofir and
Clinciu, Miruna and
Dhole, Kaustubh and
Dror, Rotem and
Gehrmann, Sebastian and
Habba, Eliya and
Itzhak, Itay and
Mille, Simon and
Perlitz, Yotam and
Santus, Enrico and
Sedoc, Jo{\~a}o and
Shmueli Scheuer, Michal and
Stanovsky, Gabriel and
Tafjord, Oyvind",
booktitle = "Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})",
month = jul,
year = "2025",
address = "Vienna, Austria and virtual meeting",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.gem-1.63/",
pages = "728--740",
ISBN = "979-8-89176-261-9",
abstract = "This study utilizes the game Codenames as a benchmarking tool to evaluate large language models (LLMs) with respect to specific linguistic and cognitive skills. LLMs play each side of the game, where one side generates a clue word covering several target words and the other guesses those target words. We designed various experiments by controlling the choice of words (abstract vs. concrete words, ambiguous vs. monosemic) or the opponent (programmed to be faster or slower in revealing words). Recent commercial and open-weight models were compared side-by-side to find out factors affecting their performance. The evaluation reveals details about their strategies, challenging cases, and limitations of LLMs."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hakimov-etal-2025-ad">
<titleInfo>
<title>Ad-hoc Concept Forming in the Game Codenames as a Means for Evaluating Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sherzod</namePart>
<namePart type="family">Hakimov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lara</namePart>
<namePart type="family">Pfennigschmidt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Schlangen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ofir</namePart>
<namePart type="family">Arviv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miruna</namePart>
<namePart type="family">Clinciu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaustubh</namePart>
<namePart type="family">Dhole</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rotem</namePart>
<namePart type="family">Dror</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eliya</namePart>
<namePart type="family">Habba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Itay</namePart>
<namePart type="family">Itzhak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yotam</namePart>
<namePart type="family">Perlitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Shmueli Scheuer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oyvind</namePart>
<namePart type="family">Tafjord</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria and virtual meeting</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-261-9</identifier>
</relatedItem>
<abstract>This study utilizes the game Codenames as a benchmarking tool to evaluate large language models (LLMs) with respect to specific linguistic and cognitive skills. LLMs play each side of the game, where one side generates a clue word covering several target words and the other guesses those target words. We designed various experiments by controlling the choice of words (abstract vs. concrete words, ambiguous vs. monosemic) or the opponent (programmed to be faster or slower in revealing words). Recent commercial and open-weight models were compared side-by-side to find out factors affecting their performance. The evaluation reveals details about their strategies, challenging cases, and limitations of LLMs.</abstract>
<identifier type="citekey">hakimov-etal-2025-ad</identifier>
<location>
<url>https://aclanthology.org/2025.gem-1.63/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>728</start>
<end>740</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Ad-hoc Concept Forming in the Game Codenames as a Means for Evaluating Large Language Models
%A Hakimov, Sherzod
%A Pfennigschmidt, Lara
%A Schlangen, David
%Y Arviv, Ofir
%Y Clinciu, Miruna
%Y Dhole, Kaustubh
%Y Dror, Rotem
%Y Gehrmann, Sebastian
%Y Habba, Eliya
%Y Itzhak, Itay
%Y Mille, Simon
%Y Perlitz, Yotam
%Y Santus, Enrico
%Y Sedoc, João
%Y Shmueli Scheuer, Michal
%Y Stanovsky, Gabriel
%Y Tafjord, Oyvind
%S Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria and virtual meeting
%@ 979-8-89176-261-9
%F hakimov-etal-2025-ad
%X This study utilizes the game Codenames as a benchmarking tool to evaluate large language models (LLMs) with respect to specific linguistic and cognitive skills. LLMs play each side of the game, where one side generates a clue word covering several target words and the other guesses those target words. We designed various experiments by controlling the choice of words (abstract vs. concrete words, ambiguous vs. monosemic) or the opponent (programmed to be faster or slower in revealing words). Recent commercial and open-weight models were compared side-by-side to find out factors affecting their performance. The evaluation reveals details about their strategies, challenging cases, and limitations of LLMs.
%U https://aclanthology.org/2025.gem-1.63/
%P 728-740
Markdown (Informal)
[Ad-hoc Concept Forming in the Game Codenames as a Means for Evaluating Large Language Models](https://aclanthology.org/2025.gem-1.63/) (Hakimov et al., GEM 2025)
ACL