@inproceedings{conforto-lopez-etal-2026-automatic,
title = "Automatic Evaluation of Open-Domain Real Conversations: Combining Encoder-Based, Dialogue-Based Features and Large Language Models Ratings",
author = "Conforto L{\'o}pez, Cristina and
Estecha-Goritagoitia, Marcos and
Rodriguez-Cantelar, Mario and
Cordoba, Ricardo and
D{'}Haro, Luis Fernando",
editor = "Riccardi, Giuseppe and
Mousavi, Seyed Mahed and
Torres, Maria Ines and
Yoshino, Koichiro and
Callejas, Zoraida and
Chowdhury, Shammur Absar and
Chen, Yun-Nung and
Bechet, Frederic and
Gustafson, Joakim and
Damnati, G{\'e}raldine and
Papangelis, Alex and
D{'}Haro, Luis Fernando and
Mendon{\c{c}}a, John and
Bernardi, Raffaella and
Hakkani-Tur, Dilek and
Di Fabbrizio, Giuseppe {''}Pino{''} and
Kawahara, Tatsuya and
Alam, Firoj and
Tur, Gokhan and
Johnston, Michael",
booktitle = "Proceedings of the 16th International Workshop on Spoken Dialogue System Technology",
month = feb,
year = "2026",
address = "Trento, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.iwsds-1.5/",
pages = "52--63",
abstract = "Conversational {AI} is a central application of {NLP}, yet ensuring high response quality remains challenging due to the inherently subjective nature of user satisfaction. Dialogue evaluation can be performed manually{---}through expert or user ratings{---}or automatically, using methods that aim to predict quality scores consistent with human judgment. In this work, we present a reference-free automatic dialogue evaluation system that predicts user ratings from a dataset of real human{--}chatbot interactions collected during the {A}lexa {P}rize Socialbot Grand Challenge 5, combining multiple complementary models to enhance correlation with human scores. Experimental results indicate that the model that achieves the highest {P}earson correlation with users' ratings is an {XGB}oost regression model that combines different features such as conversation length, engineered flags capturing conversation characteristics, predictions from an Encoder-based Panel of Experts ({P}o{E}), and instruction-based outputs from a fine-tuned {LLM}. The overall {P}earson Correlation on the eval set is 0.404, which is competitive with prior work trained on an order of magnitude more dialogues, albeit using different datasets and system configurations."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="conforto-lopez-etal-2026-automatic">
<titleInfo>
<title>Automatic Evaluation of Open-Domain Real Conversations: Combining Encoder-Based, Dialogue-Based Features and Large Language Models Ratings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cristina</namePart>
<namePart type="family">Conforto López</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Estecha-Goritagoitia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mario</namePart>
<namePart type="family">Rodriguez-Cantelar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ricardo</namePart>
<namePart type="family">Cordoba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="given">Fernando</namePart>
<namePart type="family">D’Haro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-02</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th International Workshop on Spoken Dialogue System Technology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Riccardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seyed</namePart>
<namePart type="given">Mahed</namePart>
<namePart type="family">Mousavi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Ines</namePart>
<namePart type="family">Torres</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koichiro</namePart>
<namePart type="family">Yoshino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zoraida</namePart>
<namePart type="family">Callejas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shammur</namePart>
<namePart type="given">Absar</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frederic</namePart>
<namePart type="family">Bechet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joakim</namePart>
<namePart type="family">Gustafson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Géraldine</namePart>
<namePart type="family">Damnati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Papangelis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="given">Fernando</namePart>
<namePart type="family">D’Haro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Mendonça</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raffaella</namePart>
<namePart type="family">Bernardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilek</namePart>
<namePart type="family">Hakkani-Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="given">”Pino”</namePart>
<namePart type="family">Di Fabbrizio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatsuya</namePart>
<namePart type="family">Kawahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Firoj</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gokhan</namePart>
<namePart type="family">Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Johnston</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Trento, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Conversational AI is a central application of NLP, yet ensuring high response quality remains challenging due to the inherently subjective nature of user satisfaction. Dialogue evaluation can be performed manually—through expert or user ratings—or automatically, using methods that aim to predict quality scores consistent with human judgment. In this work, we present a reference-free automatic dialogue evaluation system that predicts user ratings from a dataset of real human–chatbot interactions collected during the Alexa Prize Socialbot Grand Challenge 5, combining multiple complementary models to enhance correlation with human scores. Experimental results indicate that the model that achieves the highest Pearson correlation with users’ ratings is an XGBoost regression model that combines different features such as conversation length, engineered flags capturing conversation characteristics, predictions from an Encoder-based Panel of Experts (PoE), and instruction-based outputs from a fine-tuned LLM. The overall Pearson Correlation on the eval set is 0.404, which is competitive with prior work trained on an order of magnitude more dialogues, albeit using different datasets and system configurations.</abstract>
<identifier type="citekey">conforto-lopez-etal-2026-automatic</identifier>
<location>
<url>https://aclanthology.org/2026.iwsds-1.5/</url>
</location>
<part>
<date>2026-02</date>
<extent unit="page">
<start>52</start>
<end>63</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Evaluation of Open-Domain Real Conversations: Combining Encoder-Based, Dialogue-Based Features and Large Language Models Ratings
%A Conforto López, Cristina
%A Estecha-Goritagoitia, Marcos
%A Rodriguez-Cantelar, Mario
%A Cordoba, Ricardo
%A D’Haro, Luis Fernando
%Y Riccardi, Giuseppe
%Y Mousavi, Seyed Mahed
%Y Torres, Maria Ines
%Y Yoshino, Koichiro
%Y Callejas, Zoraida
%Y Chowdhury, Shammur Absar
%Y Chen, Yun-Nung
%Y Bechet, Frederic
%Y Gustafson, Joakim
%Y Damnati, Géraldine
%Y Papangelis, Alex
%Y D’Haro, Luis Fernando
%Y Mendonça, John
%Y Bernardi, Raffaella
%Y Hakkani-Tur, Dilek
%Y Di Fabbrizio, Giuseppe ”Pino”
%Y Kawahara, Tatsuya
%Y Alam, Firoj
%Y Tur, Gokhan
%Y Johnston, Michael
%S Proceedings of the 16th International Workshop on Spoken Dialogue System Technology
%D 2026
%8 February
%I Association for Computational Linguistics
%C Trento, Italy
%F conforto-lopez-etal-2026-automatic
%X Conversational AI is a central application of NLP, yet ensuring high response quality remains challenging due to the inherently subjective nature of user satisfaction. Dialogue evaluation can be performed manually—through expert or user ratings—or automatically, using methods that aim to predict quality scores consistent with human judgment. In this work, we present a reference-free automatic dialogue evaluation system that predicts user ratings from a dataset of real human–chatbot interactions collected during the Alexa Prize Socialbot Grand Challenge 5, combining multiple complementary models to enhance correlation with human scores. Experimental results indicate that the model that achieves the highest Pearson correlation with users’ ratings is an XGBoost regression model that combines different features such as conversation length, engineered flags capturing conversation characteristics, predictions from an Encoder-based Panel of Experts (PoE), and instruction-based outputs from a fine-tuned LLM. The overall Pearson Correlation on the eval set is 0.404, which is competitive with prior work trained on an order of magnitude more dialogues, albeit using different datasets and system configurations.
%U https://aclanthology.org/2026.iwsds-1.5/
%P 52-63
Markdown (Informal)
[Automatic Evaluation of Open-Domain Real Conversations: Combining Encoder-Based, Dialogue-Based Features and Large Language Models Ratings](https://aclanthology.org/2026.iwsds-1.5/) (Conforto López et al., IWSDS 2026)
ACL