% Overview paper for DSTC 12 Track 1 (ACL Anthology record 2025.dstc-1.3).
@inproceedings{mendonca-etal-2025-overview,
  author    = {Mendon{\c{c}}a, John
               and Zhang, Lining
               and Mallidi, Rahul
               and Lavie, Alon
               and Trancoso, Isabel
               and D{'}Haro, Luis Fernando
               and Sedoc, Jo{\~a}o},
  title     = {Overview of Dialog System Evaluation Track: Dimensionality, Language, Culture and Safety at {DSTC} 12},
  editor    = {Hedayatnia, Behnam
               and Chen, Vivian
               and Chen, Zhang
               and Gupta, Raghav
               and Galley, Michel},
  booktitle = {Proceedings of the Twelfth Dialog System Technology Challenge},
  year      = {2025},
  month     = aug,
  address   = {Avignon, France},
  publisher = {Association for Computational Linguistics},
  pages     = {27--35},
  isbn      = {979-8-89176-330-2},
  url       = {https://aclanthology.org/2025.dstc-1.3/},
  abstract  = {The rapid advancement of Large Language Models (LLMs) has intensified the need for robust dialogue system evaluation, yet comprehensive assessment remains challenging. Traditional metrics often prove insufficient, and safety considerations are frequently narrowly defined or culturally biased. The DSTC12 Track 1, ``Dialog System Evaluation: Dimensionality, Language, Culture and Safety,'' is part of the ongoing effort to address these critical gaps. The track comprised two subtasks: (1) Dialogue-level, Multi-dimensional Automatic Evaluation Metrics, and (2) Multilingual and Multicultural Safety Detection. For Task 1, focused on 10 dialogue dimensions, a Llama-3-8B baseline achieved the highest average Spearman{'}s correlation (0.1681), indicating substantial room for improvement. In Task 2, while participating teams significantly outperformed a Llama-Guard-3-1B baseline on the multilingual safety subset (top ROC-AUC 0.9648), the baseline proved superior on the cultural subset (0.5126 ROC-AUC), highlighting critical needs in culturally-aware safety. This paper describes the datasets and baselines provided to participants, as well as submission evaluation results for each of the two proposed subtasks.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey mendonca-etal-2025-overview. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="mendonca-etal-2025-overview">
    <titleInfo>
      <title>Overview of Dialog System Evaluation Track: Dimensionality, Language, Culture and Safety at DSTC 12</title>
    </titleInfo>

    <!-- Paper authors, one name element per person, each tagged with the "author" role. -->
    <name type="personal">
      <namePart type="given">John</namePart>
      <namePart type="family">Mendonça</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lining</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rahul</namePart>
      <namePart type="family">Mallidi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alon</namePart>
      <namePart type="family">Lavie</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Isabel</namePart>
      <namePart type="family">Trancoso</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <!-- Two given-name parts: "Luis" and "Fernando". -->
      <namePart type="given">Luis</namePart>
      <namePart type="given">Fernando</namePart>
      <namePart type="family">D’Haro</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">João</namePart>
      <namePart type="family">Sedoc</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host publication: the proceedings volume with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Twelfth Dialog System Technology Challenge</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Behnam</namePart>
        <namePart type="family">Hedayatnia</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vivian</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zhang</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Raghav</namePart>
        <namePart type="family">Gupta</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Michel</namePart>
        <namePart type="family">Galley</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Avignon, France</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-330-2</identifier>
    </relatedItem>

    <abstract>The rapid advancement of Large Language Models (LLMs) has intensified the need for robust dialogue system evaluation, yet comprehensive assessment remains challenging. Traditional metrics often prove insufficient, and safety considerations are frequently narrowly defined or culturally biased. The DSTC12 Track 1, “Dialog System Evaluation: Dimensionality, Language, Culture and Safety,” is part of the ongoing effort to address these critical gaps. The track comprised two subtasks: (1) Dialogue-level, Multi-dimensional Automatic Evaluation Metrics, and (2) Multilingual and Multicultural Safety Detection. For Task 1, focused on 10 dialogue dimensions, a Llama-3-8B baseline achieved the highest average Spearman’s correlation (0.1681), indicating substantial room for improvement. In Task 2, while participating teams significantly outperformed a Llama-Guard-3-1B baseline on the multilingual safety subset (top ROC-AUC 0.9648), the baseline proved superior on the cultural subset (0.5126 ROC-AUC), highlighting critical needs in culturally-aware safety. This paper describes the datasets and baselines provided to participants, as well as submission evaluation results for each of the two proposed subtasks.</abstract>
    <identifier type="citekey">mendonca-etal-2025-overview</identifier>
    <location>
      <url>https://aclanthology.org/2025.dstc-1.3/</url>
    </location>

    <!-- Date and page extent (start and end page) of this paper. -->
    <part>
      <date>2025-08</date>
      <extent unit="page">
        <start>27</start>
        <end>35</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Overview of Dialog System Evaluation Track: Dimensionality, Language, Culture and Safety at DSTC 12
%A Mendonça, John
%A Zhang, Lining
%A Mallidi, Rahul
%A Lavie, Alon
%A Trancoso, Isabel
%A D’Haro, Luis Fernando
%A Sedoc, João
%Y Hedayatnia, Behnam
%Y Chen, Vivian
%Y Chen, Zhang
%Y Gupta, Raghav
%Y Galley, Michel
%S Proceedings of the Twelfth Dialog System Technology Challenge
%D 2025
%8 August
%I Association for Computational Linguistics
%C Avignon, France
%@ 979-8-89176-330-2
%F mendonca-etal-2025-overview
%X The rapid advancement of Large Language Models (LLMs) has intensified the need for robust dialogue system evaluation, yet comprehensive assessment remains challenging. Traditional metrics often prove insufficient, and safety considerations are frequently narrowly defined or culturally biased. The DSTC12 Track 1, “Dialog System Evaluation: Dimensionality, Language, Culture and Safety,” is part of the ongoing effort to address these critical gaps. The track comprised two subtasks: (1) Dialogue-level, Multi-dimensional Automatic Evaluation Metrics, and (2) Multilingual and Multicultural Safety Detection. For Task 1, focused on 10 dialogue dimensions, a Llama-3-8B baseline achieved the highest average Spearman’s correlation (0.1681), indicating substantial room for improvement. In Task 2, while participating teams significantly outperformed a Llama-Guard-3-1B baseline on the multilingual safety subset (top ROC-AUC 0.9648), the baseline proved superior on the cultural subset (0.5126 ROC-AUC), highlighting critical needs in culturally-aware safety. This paper describes the datasets and baselines provided to participants, as well as submission evaluation results for each of the two proposed subtasks.
%U https://aclanthology.org/2025.dstc-1.3/
%P 27-35
Markdown (Informal)
[Overview of Dialog System Evaluation Track: Dimensionality, Language, Culture and Safety at DSTC 12](https://aclanthology.org/2025.dstc-1.3/) (Mendonça et al., DSTC 2025)
ACL
- John Mendonça, Lining Zhang, Rahul Mallidi, Alon Lavie, Isabel Trancoso, Luis Fernando D’Haro, and João Sedoc. 2025. Overview of Dialog System Evaluation Track: Dimensionality, Language, Culture and Safety at DSTC 12. In Proceedings of the Twelfth Dialog System Technology Challenge, pages 27–35, Avignon, France. Association for Computational Linguistics.