@inproceedings{jain-etal-2025-autoeval,
title = "{A}uto{E}val-{T}o{D}: Automated Evaluation of Task-oriented Dialog Systems",
author = "Jain, Arihant and
Aggarwal, Purav and
Sahay, Rishav and
Dong, Chaosheng and
Saladi, Anoop",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.508/",
doi = "10.18653/v1/2025.naacl-long.508",
pages = "10133--10148",
ISBN = "979-8-89176-189-6",
abstract = "Task-oriented Dialog systems (ToD) are essential in automating user interactions, but their complex design and dynamic nature make evaluation particularly challenging. Current evaluation methodologies heavily depend on human annotators, which can be inefficient, subjective, and expensive to scale. To advance the field, there is a pressing need for a reliable, scalable, and systematic evaluation framework that can provide comprehensive insights into ToD system performance. In this paper, we propose, AutoEval-TOD, an automated end-to-end evaluation framework using large language models (LLMs). Our framework first interacts with the ToD system and then assesses its performance across key dimensions by analyzing both the ToD{'}s responses and internal states. We validate our approach by applying it to multiple ToD systems, highlighting its adaptability and potential for widespread use in both research and industrial settings."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jain-etal-2025-autoeval">
<titleInfo>
<title>AutoEval-ToD: Automated Evaluation of Task-oriented Dialog Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arihant</namePart>
<namePart type="family">Jain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Purav</namePart>
<namePart type="family">Aggarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rishav</namePart>
<namePart type="family">Sahay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chaosheng</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Saladi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>Task-oriented Dialog systems (ToD) are essential in automating user interactions, but their complex design and dynamic nature make evaluation particularly challenging. Current evaluation methodologies heavily depend on human annotators, which can be inefficient, subjective, and expensive to scale. To advance the field, there is a pressing need for a reliable, scalable, and systematic evaluation framework that can provide comprehensive insights into ToD system performance. In this paper, we propose, AutoEval-TOD, an automated end-to-end evaluation framework using large language models (LLMs). Our framework first interacts with the ToD system and then assesses its performance across key dimensions by analyzing both the ToD’s responses and internal states. We validate our approach by applying it to multiple ToD systems, highlighting its adaptability and potential for widespread use in both research and industrial settings.</abstract>
<identifier type="citekey">jain-etal-2025-autoeval</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.508</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.508/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>10133</start>
<end>10148</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AutoEval-ToD: Automated Evaluation of Task-oriented Dialog Systems
%A Jain, Arihant
%A Aggarwal, Purav
%A Sahay, Rishav
%A Dong, Chaosheng
%A Saladi, Anoop
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F jain-etal-2025-autoeval
%X Task-oriented Dialog systems (ToD) are essential in automating user interactions, but their complex design and dynamic nature make evaluation particularly challenging. Current evaluation methodologies heavily depend on human annotators, which can be inefficient, subjective, and expensive to scale. To advance the field, there is a pressing need for a reliable, scalable, and systematic evaluation framework that can provide comprehensive insights into ToD system performance. In this paper, we propose AutoEval-ToD, an automated end-to-end evaluation framework using large language models (LLMs). Our framework first interacts with the ToD system and then assesses its performance across key dimensions by analyzing both the ToD’s responses and internal states. We validate our approach by applying it to multiple ToD systems, highlighting its adaptability and potential for widespread use in both research and industrial settings.
%R 10.18653/v1/2025.naacl-long.508
%U https://aclanthology.org/2025.naacl-long.508/
%U https://doi.org/10.18653/v1/2025.naacl-long.508
%P 10133-10148
Markdown (Informal)
[AutoEval-ToD: Automated Evaluation of Task-oriented Dialog Systems](https://aclanthology.org/2025.naacl-long.508/) (Jain et al., NAACL 2025)

ACL
Arihant Jain, Purav Aggarwal, Rishav Sahay, Chaosheng Dong, and Anoop Saladi. 2025. AutoEval-ToD: Automated Evaluation of Task-oriented Dialog Systems. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 10133–10148, Albuquerque, New Mexico. Association for Computational Linguistics.