@inproceedings{lee-etal-2025-telagentbench,
title = "{T}el{A}gent{B}ench: A Multi-faceted Benchmark for Evaluating {LLM}-based Agents in Telecommunications",
author = "Lee, Sunwoo and
Jang, Daseong and
Arya, Dhammiko and
Han, Gyoung-eun and
Song, Injee and
Kim, SaeRom and
Kim, Sangjin and
Lee, Seojin and
Hong, Seokyoung and
Sek, Sereimony and
Cho, Seung-Mo and
Park, Sohee and
Yoon, Sungbin and
Jang, Wonbeom and
Davis, Eric",
editor = "Potdar, Saloni and
Rojas-Barahona, Lina and
Montella, Sebastien",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2025",
address = "Suzhou (China)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-industry.83/",
pages = "1173--1211",
ISBN = "979-8-89176-333-3",
abstract = "As Large Language Models (LLMs) evolve into powerful agentic systems, the telecommunications industry{'}s expansion into AI services necessitates industry-grounded benchmarks to evaluate their underexplored domain-specific capabilities. To address the gap left by generic benchmarks that fail to assess realistic, non-English performance, we present TelAgentBench, a Korean benchmark for the telecommunications domain evaluating five core agentic capabilities: Reasoning, Planning, Action (tool-use), Retrieval-Augmented Generation, and Instruction Following. Evaluations reveal significant performance disparities between models that employ explicit reasoning and those that do not, providing actionable insights for deploying agentic LLMs in real-world telecommunications tasks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lee-etal-2025-telagentbench">
<titleInfo>
<title>TelAgentBench: A Multi-faceted Benchmark for Evaluating LLM-based Agents in Telecommunications</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sunwoo</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daseong</namePart>
<namePart type="family">Jang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhammiko</namePart>
<namePart type="family">Arya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gyoung-eun</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Injee</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">SaeRom</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sangjin</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seojin</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokyoung</namePart>
<namePart type="family">Hong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sereimony</namePart>
<namePart type="family">Sek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seung-Mo</namePart>
<namePart type="family">Cho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sohee</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sungbin</namePart>
<namePart type="family">Yoon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wonbeom</namePart>
<namePart type="family">Jang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Davis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saloni</namePart>
<namePart type="family">Potdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lina</namePart>
<namePart type="family">Rojas-Barahona</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastien</namePart>
<namePart type="family">Montella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou (China)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-333-3</identifier>
</relatedItem>
<abstract>As Large Language Models (LLMs) evolve into powerful agentic systems, the telecommunications industry’s expansion into AI services necessitates industry-grounded benchmarks to evaluate their underexplored domain-specific capabilities. To address the gap left by generic benchmarks that fail to assess realistic, non-English performance, we present TelAgentBench, a Korean benchmark for the telecommunications domain evaluating five core agentic capabilities: Reasoning, Planning, Action (tool-use), Retrieval-Augmented Generation, and Instruction Following. Evaluations reveal significant performance disparities between models that employ explicit reasoning and those that do not, providing actionable insights for deploying agentic LLMs in real-world telecommunications tasks.</abstract>
<identifier type="citekey">lee-etal-2025-telagentbench</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-industry.83/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>1173</start>
<end>1211</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TelAgentBench: A Multi-faceted Benchmark for Evaluating LLM-based Agents in Telecommunications
%A Lee, Sunwoo
%A Jang, Daseong
%A Arya, Dhammiko
%A Han, Gyoung-eun
%A Song, Injee
%A Kim, SaeRom
%A Kim, Sangjin
%A Lee, Seojin
%A Hong, Seokyoung
%A Sek, Sereimony
%A Cho, Seung-Mo
%A Park, Sohee
%A Yoon, Sungbin
%A Jang, Wonbeom
%A Davis, Eric
%Y Potdar, Saloni
%Y Rojas-Barahona, Lina
%Y Montella, Sebastien
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou (China)
%@ 979-8-89176-333-3
%F lee-etal-2025-telagentbench
%X As Large Language Models (LLMs) evolve into powerful agentic systems, the telecommunications industry’s expansion into AI services necessitates industry-grounded benchmarks to evaluate their underexplored domain-specific capabilities. To address the gap left by generic benchmarks that fail to assess realistic, non-English performance, we present TelAgentBench, a Korean benchmark for the telecommunications domain evaluating five core agentic capabilities: Reasoning, Planning, Action (tool-use), Retrieval-Augmented Generation, and Instruction Following. Evaluations reveal significant performance disparities between models that employ explicit reasoning and those that do not, providing actionable insights for deploying agentic LLMs in real-world telecommunications tasks.
%U https://aclanthology.org/2025.emnlp-industry.83/
%P 1173-1211
Markdown (Informal)
[TelAgentBench: A Multi-faceted Benchmark for Evaluating LLM-based Agents in Telecommunications](https://aclanthology.org/2025.emnlp-industry.83/) (Lee et al., EMNLP 2025)
ACL
- Sunwoo Lee, Daseong Jang, Dhammiko Arya, Gyoung-eun Han, Injee Song, SaeRom Kim, Sangjin Kim, Seojin Lee, Seokyoung Hong, Sereimony Sek, Seung-Mo Cho, Sohee Park, Sungbin Yoon, Wonbeom Jang, and Eric Davis. 2025. TelAgentBench: A Multi-faceted Benchmark for Evaluating LLM-based Agents in Telecommunications. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 1173–1211, Suzhou (China). Association for Computational Linguistics.