@inproceedings{damerla-etal-2025-textbandit,
  title     = {{TextBandit}: Evaluating Probabilistic Reasoning in {LLMs} Through Language-Only Decision Tasks},
  author    = {Damerla, Arjun and
               Lim, Jimin and
               Jiang, Yanxi and
               Le, Nam Nguyen Hoai and
               Selladurai, Nikil},
  editor    = {Premasiri, Damith and
               Ranasinghe, Tharindu and
               Hettiarachchi, Hansi},
  booktitle = {Proceedings of the First Workshop on Ethical Concerns in Training, Evaluating and Deploying Large Language Models},
  month     = sep,
  year      = {2025},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd., Shoumen, Bulgaria},
  url       = {https://aclanthology.org/2025.ethicalllms-1.1/},
  pages     = {1--8},
  abstract  = {Large language models (LLMs) have shown to be increasingly capable of performing reasoning tasks, but their ability to make sequential decisions under uncertainty only using natural language remains under-explored. We introduce a novel benchmark in which LLMs interact with multi-armed bandit environments using purely textual feedback, ``you earned a token'', without access to numerical cues or explicit probabilities, resulting in the model to infer latent reward structures purely off linguistic cues and to adapt accordingly. We evaluated the performance of four open-source LLMs and compare their performance to standard decision-making algorithms such as Thompson Sampling, Epsilon Greedy, Upper Confidence Bound (UCB), and random choice. While most of the LLMs underperformed compared to the baselines, Qwen3-4B, achieved the best-arm selection rate of 89.2{\%}, which significantly outperformed both the larger LLMs and traditional methods. Our findings suggest that probabilistic reasoning is able to emerge from language alone, and we present this benchmark as a step towards evaluating decision-making capabilities in naturalistic, non-numeric contexts.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="damerla-etal-2025-textbandit">
<titleInfo>
<title>TextBandit: Evaluating Probabilistic Reasoning in LLMs Through Language-Only Decision Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arjun</namePart>
<namePart type="family">Damerla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jimin</namePart>
<namePart type="family">Lim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanxi</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nam</namePart>
<namePart type="given">Nguyen</namePart>
<namePart type="given">Hoai</namePart>
<namePart type="family">Le</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikil</namePart>
<namePart type="family">Selladurai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Ethical Concerns in Training, Evaluating and Deploying Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Damith</namePart>
<namePart type="family">Premasiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Ranasinghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hansi</namePart>
<namePart type="family">Hettiarachchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models (LLMs) have shown to be increasingly capable of performing reasoning tasks, but their ability to make sequential decisions under uncertainty only using natural language remains under-explored. We introduce a novel benchmark in which LLMs interact with multi-armed bandit environments using purely textual feedback, “you earned a token”, without access to numerical cues or explicit probabilities, resulting in the model to infer latent reward structures purely off linguistic cues and to adapt accordingly. We evaluated the performance of four open-source LLMs and compare their performance to standard decision-making algorithms such as Thompson Sampling, Epsilon Greedy, Upper Confidence Bound (UCB), and random choice. While most of the LLMs underperformed compared to the baselines, Qwen3-4B, achieved the best-arm selection rate of 89.2%, which significantly outperformed both the larger LLMs and traditional methods. Our findings suggest that probabilistic reasoning is able to emerge from language alone, and we present this benchmark as a step towards evaluating decision-making capabilities in naturalistic, non-numeric contexts.</abstract>
<identifier type="citekey">damerla-etal-2025-textbandit</identifier>
<location>
<url>https://aclanthology.org/2025.ethicalllms-1.1/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>1</start>
<end>8</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TextBandit: Evaluating Probabilistic Reasoning in LLMs Through Language-Only Decision Tasks
%A Damerla, Arjun
%A Lim, Jimin
%A Jiang, Yanxi
%A Le, Nam Nguyen Hoai
%A Selladurai, Nikil
%Y Premasiri, Damith
%Y Ranasinghe, Tharindu
%Y Hettiarachchi, Hansi
%S Proceedings of the First Workshop on Ethical Concerns in Training, Evaluating and Deploying Large Language Models
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F damerla-etal-2025-textbandit
%X Large language models (LLMs) have shown to be increasingly capable of performing reasoning tasks, but their ability to make sequential decisions under uncertainty only using natural language remains under-explored. We introduce a novel benchmark in which LLMs interact with multi-armed bandit environments using purely textual feedback, “you earned a token”, without access to numerical cues or explicit probabilities, resulting in the model to infer latent reward structures purely off linguistic cues and to adapt accordingly. We evaluated the performance of four open-source LLMs and compare their performance to standard decision-making algorithms such as Thompson Sampling, Epsilon Greedy, Upper Confidence Bound (UCB), and random choice. While most of the LLMs underperformed compared to the baselines, Qwen3-4B, achieved the best-arm selection rate of 89.2%, which significantly outperformed both the larger LLMs and traditional methods. Our findings suggest that probabilistic reasoning is able to emerge from language alone, and we present this benchmark as a step towards evaluating decision-making capabilities in naturalistic, non-numeric contexts.
%U https://aclanthology.org/2025.ethicalllms-1.1/
%P 1-8
Markdown (Informal)
[TextBandit: Evaluating Probabilistic Reasoning in LLMs Through Language-Only Decision Tasks](https://aclanthology.org/2025.ethicalllms-1.1/) (Damerla et al., EthicalLLMs 2025)
ACL
- Arjun Damerla, Jimin Lim, Yanxi Jiang, Nam Nguyen Hoai Le, and Nikil Selladurai. 2025. TextBandit: Evaluating Probabilistic Reasoning in LLMs Through Language-Only Decision Tasks. In Proceedings of the First Workshop on Ethical Concerns in Training, Evaluating and Deploying Large Language Models, pages 1–8, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.