% Conference paper record (SIGDIAL 2024, pages 610--623).
% Field values are brace-delimited; `month` uses the predefined BibTeX macro.
@inproceedings{georgila-2024-comparing,
  author    = {Georgila, Kallirroi},
  title     = {Comparing Pre-Trained Embeddings and Domain-Independent Features for Regression-Based Evaluation of Task-Oriented Dialogue Systems},
  editor    = {Kawahara, Tatsuya and
               Demberg, Vera and
               Ultes, Stefan and
               Inoue, Koji and
               Mehri, Shikib and
               Howcroft, David and
               Komatani, Kazunori},
  booktitle = {Proceedings of the 25th Annual Meeting of the Special Interest Group on Discourse and Dialogue},
  month     = sep,
  year      = {2024},
  address   = {Kyoto, Japan},
  publisher = {Association for Computational Linguistics},
  pages     = {610--623},
  doi       = {10.18653/v1/2024.sigdial-1.52},
  url       = {https://aclanthology.org/2024.sigdial-1.52},
  abstract  = {We use Gaussian Process Regression to predict different types of ratings provided by users after interacting with various task-oriented dialogue systems. We compare the performance of domain-independent dialogue features (e.g., duration, number of filled slots, number of confirmed slots, word error rate) with pre-trained dialogue embeddings. These pre-trained dialogue embeddings are computed by averaging over sentence embeddings in a dialogue. Sentence embeddings are created using various models based on sentence transformers (appearing on the Hugging Face Massive Text Embedding Benchmark leaderboard) or by averaging over BERT word embeddings (varying the BERT layers used). We also compare pre-trained embeddings extracted from human transcriptions with pre-trained embeddings extracted from speech recognition outputs, to determine the robustness of these models to errors. Our results show that overall, for most types of user satisfaction ratings and advanced/recent (or sometimes less advanced/recent) pre-trained embedding models, using only pre-trained embeddings outperforms using only domain-independent features. However, this pattern varies depending on the type of rating and the embedding model used. Also, pre-trained embeddings are found to be robust to speech recognition errors, more advanced/recent embedding models do not always perform better than less advanced/recent ones, and larger models do not necessarily outperform smaller ones. The best prediction performance is achieved by combining pre-trained embeddings with domain-independent features.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS (v3 namespace) collection holding a single record, ID "georgila-2024-comparing". -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="georgila-2024-comparing">
    <!-- Title and sole author of the paper itself. -->
    <titleInfo>
      <title>Comparing Pre-Trained Embeddings and Domain-Independent Features for Regression-Based Evaluation of Task-Oriented Dialogue Systems</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Kallirroi</namePart>
      <namePart type="family">Georgila</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its seven editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 25th Annual Meeting of the Special Interest Group on Discourse and Dialogue</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Tatsuya</namePart>
        <namePart type="family">Kawahara</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vera</namePart>
        <namePart type="family">Demberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stefan</namePart>
        <namePart type="family">Ultes</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Koji</namePart>
        <namePart type="family">Inoue</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Shikib</namePart>
        <namePart type="family">Mehri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Howcroft</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kazunori</namePart>
        <namePart type="family">Komatani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Kyoto, Japan</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We use Gaussian Process Regression to predict different types of ratings provided by users after interacting with various task-oriented dialogue systems. We compare the performance of domain-independent dialogue features (e.g., duration, number of filled slots, number of confirmed slots, word error rate) with pre-trained dialogue embeddings. These pre-trained dialogue embeddings are computed by averaging over sentence embeddings in a dialogue. Sentence embeddings are created using various models based on sentence transformers (appearing on the Hugging Face Massive Text Embedding Benchmark leaderboard) or by averaging over BERT word embeddings (varying the BERT layers used). We also compare pre-trained embeddings extracted from human transcriptions with pre-trained embeddings extracted from speech recognition outputs, to determine the robustness of these models to errors. Our results show that overall, for most types of user satisfaction ratings and advanced/recent (or sometimes less advanced/recent) pre-trained embedding models, using only pre-trained embeddings outperforms using only domain-independent features. However, this pattern varies depending on the type of rating and the embedding model used. Also, pre-trained embeddings are found to be robust to speech recognition errors, more advanced/recent embedding models do not always perform better than less advanced/recent ones, and larger models do not necessarily outperform smaller ones. The best prediction performance is achieved by combining pre-trained embeddings with domain-independent features.</abstract>
    <!-- Identifiers and landing-page URL for the paper. -->
    <identifier type="citekey">georgila-2024-comparing</identifier>
    <identifier type="doi">10.18653/v1/2024.sigdial-1.52</identifier>
    <location>
      <url>https://aclanthology.org/2024.sigdial-1.52</url>
    </location>
    <!-- Position of the paper within the host volume: issue date and page range. -->
    <part>
      <date>2024-09</date>
      <extent unit="page">
        <start>610</start>
        <end>623</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Comparing Pre-Trained Embeddings and Domain-Independent Features for Regression-Based Evaluation of Task-Oriented Dialogue Systems
%A Georgila, Kallirroi
%Y Kawahara, Tatsuya
%Y Demberg, Vera
%Y Ultes, Stefan
%Y Inoue, Koji
%Y Mehri, Shikib
%Y Howcroft, David
%Y Komatani, Kazunori
%S Proceedings of the 25th Annual Meeting of the Special Interest Group on Discourse and Dialogue
%D 2024
%8 September
%I Association for Computational Linguistics
%C Kyoto, Japan
%F georgila-2024-comparing
%X We use Gaussian Process Regression to predict different types of ratings provided by users after interacting with various task-oriented dialogue systems. We compare the performance of domain-independent dialogue features (e.g., duration, number of filled slots, number of confirmed slots, word error rate) with pre-trained dialogue embeddings. These pre-trained dialogue embeddings are computed by averaging over sentence embeddings in a dialogue. Sentence embeddings are created using various models based on sentence transformers (appearing on the Hugging Face Massive Text Embedding Benchmark leaderboard) or by averaging over BERT word embeddings (varying the BERT layers used). We also compare pre-trained embeddings extracted from human transcriptions with pre-trained embeddings extracted from speech recognition outputs, to determine the robustness of these models to errors. Our results show that overall, for most types of user satisfaction ratings and advanced/recent (or sometimes less advanced/recent) pre-trained embedding models, using only pre-trained embeddings outperforms using only domain-independent features. However, this pattern varies depending on the type of rating and the embedding model used. Also, pre-trained embeddings are found to be robust to speech recognition errors, more advanced/recent embedding models do not always perform better than less advanced/recent ones, and larger models do not necessarily outperform smaller ones. The best prediction performance is achieved by combining pre-trained embeddings with domain-independent features.
%R 10.18653/v1/2024.sigdial-1.52
%U https://aclanthology.org/2024.sigdial-1.52
%U https://doi.org/10.18653/v1/2024.sigdial-1.52
%P 610-623
Markdown (Informal)
[Comparing Pre-Trained Embeddings and Domain-Independent Features for Regression-Based Evaluation of Task-Oriented Dialogue Systems](https://aclanthology.org/2024.sigdial-1.52) (Georgila, SIGDIAL 2024)
ACL