% Conference paper record (ACL Anthology export); values are brace-delimited, one field per line.
@inproceedings{hu-etal-2024-reranking,
  title     = {Reranking Overgenerated Responses for End-to-End Task-Oriented Dialogue Systems},
  author    = {Hu, Songbo and
               Vuli{\'c}, Ivan and
               Liu, Fangyu and
               Korhonen, Anna},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.1219},
  pages     = {13970--13991},
  abstract  = {End-to-end task-oriented dialogue systems are prone to fall into the so-called {`}likelihood trap{'}, resulting in generated responses which are dull, repetitive, and often inconsistent with dialogue history. Comparing ranked lists of multiple generated responses against the {`}gold response{'} reveals a wide diversity in quality, with many good responses placed lower in the ranked list. The main challenge addressed in this work is how to reach beyond greedily generated system responses, that is, how to obtain and select high-quality responses from the list of overgenerated responses at inference without the availability of the gold response. To this end, we propose a simple yet effective reranking method to select high-quality items from the lists of initially overgenerated responses. The idea is to use any sequence-level scoring function to divide the semantic space of responses into high-scoring versus low-scoring partitions. At training, the high-scoring partition comprises all generated responses whose similarity to the gold response is higher than the similarity of the greedy response to the gold response. At inference, the aim is to estimate the probability that each overgenerated response belongs to the high-scoring partition. We evaluate our proposed method on the standard MultiWOZ dataset, the BiTOD dataset, and with human evaluation.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey hu-etal-2024-reranking: the paper, its four authors, and the host proceedings. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="hu-etal-2024-reranking">
    <titleInfo>
      <title>Reranking Overgenerated Responses for End-to-End Task-Oriented Dialogue Systems</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Songbo</namePart>
      <namePart type="family">Hu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ivan</namePart>
      <namePart type="family">Vulić</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fangyu</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anna</namePart>
      <namePart type="family">Korhonen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Min-Yen</namePart>
        <namePart type="family">Kan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Veronique</namePart>
        <namePart type="family">Hoste</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alessandro</namePart>
        <namePart type="family">Lenci</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sakriani</namePart>
        <namePart type="family">Sakti</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nianwen</namePart>
        <namePart type="family">Xue</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>ELRA and ICCL</publisher>
        <place>
          <placeTerm type="text">Torino, Italia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>End-to-end task-oriented dialogue systems are prone to fall into the so-called ‘likelihood trap’, resulting in generated responses which are dull, repetitive, and often inconsistent with dialogue history. Comparing ranked lists of multiple generated responses against the ‘gold response’ reveals a wide diversity in quality, with many good responses placed lower in the ranked list. The main challenge addressed in this work is how to reach beyond greedily generated system responses, that is, how to obtain and select high-quality responses from the list of overgenerated responses at inference without the availability of the gold response. To this end, we propose a simple yet effective reranking method to select high-quality items from the lists of initially overgenerated responses. The idea is to use any sequence-level scoring function to divide the semantic space of responses into high-scoring versus low-scoring partitions. At training, the high-scoring partition comprises all generated responses whose similarity to the gold response is higher than the similarity of the greedy response to the gold response. At inference, the aim is to estimate the probability that each overgenerated response belongs to the high-scoring partition. We evaluate our proposed method on the standard MultiWOZ dataset, the BiTOD dataset, and with human evaluation.</abstract>
    <identifier type="citekey">hu-etal-2024-reranking</identifier>
    <location>
      <url>https://aclanthology.org/2024.lrec-main.1219</url>
    </location>
    <!-- Issue date and page extent of the paper within the host volume -->
    <part>
      <date>2024-05</date>
      <extent unit="page">
        <start>13970</start>
        <end>13991</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Reranking Overgenerated Responses for End-to-End Task-Oriented Dialogue Systems
%A Hu, Songbo
%A Vulić, Ivan
%A Liu, Fangyu
%A Korhonen, Anna
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F hu-etal-2024-reranking
%X End-to-end task-oriented dialogue systems are prone to fall into the so-called ‘likelihood trap’, resulting in generated responses which are dull, repetitive, and often inconsistent with dialogue history. Comparing ranked lists of multiple generated responses against the ‘gold response’ reveals a wide diversity in quality, with many good responses placed lower in the ranked list. The main challenge addressed in this work is how to reach beyond greedily generated system responses, that is, how to obtain and select high-quality responses from the list of overgenerated responses at inference without the availability of the gold response. To this end, we propose a simple yet effective reranking method to select high-quality items from the lists of initially overgenerated responses. The idea is to use any sequence-level scoring function to divide the semantic space of responses into high-scoring versus low-scoring partitions. At training, the high-scoring partition comprises all generated responses whose similarity to the gold response is higher than the similarity of the greedy response to the gold response. At inference, the aim is to estimate the probability that each overgenerated response belongs to the high-scoring partition. We evaluate our proposed method on the standard MultiWOZ dataset, the BiTOD dataset, and with human evaluation.
%U https://aclanthology.org/2024.lrec-main.1219
%P 13970-13991
Markdown (Informal)
[Reranking Overgenerated Responses for End-to-End Task-Oriented Dialogue Systems](https://aclanthology.org/2024.lrec-main.1219) (Hu et al., LREC-COLING 2024)
ACL