@article{zhao-etal-2024-rescue,
title = "Rescue Conversations from Dead-ends: Efficient Exploration for Task-oriented Dialogue Policy Optimization",
author = "Zhao, Yangyang and
Dastani, Mehdi and
Long, Jinchuan and
Wang, Zhenyu and
Wang, Shihan",
journal = "Transactions of the Association for Computational Linguistics",
volume = "12",
year = "2024",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2024.tacl-1.86/",
doi = "10.1162/tacl_a_00717",
pages = "1578--1596",
abstract = "Training a task-oriented dialogue policy using deep reinforcement learning is promising but requires extensive environment exploration. The amount of wasted invalid exploration makes policy learning inefficient. In this paper, we define and argue that dead-end states are important reasons for invalid exploration. When a conversation enters a dead-end state, regardless of the actions taken afterward, it will continue in a dead-end trajectory until the agent reaches a termination state or maximum turn. We propose a Dead-end Detection and Resurrection (DDR) method that detects dead-end states in an efficient manner and provides a rescue action to guide and correct the exploration direction. To prevent dialogue policies from repeating errors, DDR also performs dialogue data augmentation by adding relevant experiences that include dead-end states and penalties into the experience pool. We first validate the dead-end detection reliability and then demonstrate the effectiveness and generality of the method across various domains through experiments on four public dialogue datasets."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhao-etal-2024-rescue">
<titleInfo>
<title>Rescue Conversations from Dead-ends: Efficient Exploration for Task-oriented Dialogue Policy Optimization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yangyang</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehdi</namePart>
<namePart type="family">Dastani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinchuan</namePart>
<namePart type="family">Long</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhenyu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shihan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Training a task-oriented dialogue policy using deep reinforcement learning is promising but requires extensive environment exploration. The amount of wasted invalid exploration makes policy learning inefficient. In this paper, we define and argue that dead-end states are important reasons for invalid exploration. When a conversation enters a dead-end state, regardless of the actions taken afterward, it will continue in a dead-end trajectory until the agent reaches a termination state or maximum turn. We propose a Dead-end Detection and Resurrection (DDR) method that detects dead-end states in an efficient manner and provides a rescue action to guide and correct the exploration direction. To prevent dialogue policies from repeating errors, DDR also performs dialogue data augmentation by adding relevant experiences that include dead-end states and penalties into the experience pool. We first validate the dead-end detection reliability and then demonstrate the effectiveness and generality of the method across various domains through experiments on four public dialogue datasets.</abstract>
<identifier type="citekey">zhao-etal-2024-rescue</identifier>
<identifier type="doi">10.1162/tacl_a_00717</identifier>
<location>
<url>https://aclanthology.org/2024.tacl-1.86/</url>
</location>
<part>
<date>2024</date>
<detail type="volume"><number>12</number></detail>
<extent unit="page">
<start>1578</start>
<end>1596</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Rescue Conversations from Dead-ends: Efficient Exploration for Task-oriented Dialogue Policy Optimization
%A Zhao, Yangyang
%A Dastani, Mehdi
%A Long, Jinchuan
%A Wang, Zhenyu
%A Wang, Shihan
%J Transactions of the Association for Computational Linguistics
%D 2024
%V 12
%I MIT Press
%C Cambridge, MA
%F zhao-etal-2024-rescue
%X Training a task-oriented dialogue policy using deep reinforcement learning is promising but requires extensive environment exploration. The amount of wasted invalid exploration makes policy learning inefficient. In this paper, we define and argue that dead-end states are important reasons for invalid exploration. When a conversation enters a dead-end state, regardless of the actions taken afterward, it will continue in a dead-end trajectory until the agent reaches a termination state or maximum turn. We propose a Dead-end Detection and Resurrection (DDR) method that detects dead-end states in an efficient manner and provides a rescue action to guide and correct the exploration direction. To prevent dialogue policies from repeating errors, DDR also performs dialogue data augmentation by adding relevant experiences that include dead-end states and penalties into the experience pool. We first validate the dead-end detection reliability and then demonstrate the effectiveness and generality of the method across various domains through experiments on four public dialogue datasets.
%R 10.1162/tacl_a_00717
%U https://aclanthology.org/2024.tacl-1.86/
%U https://doi.org/10.1162/tacl_a_00717
%P 1578-1596
Markdown (Informal)
[Rescue Conversations from Dead-ends: Efficient Exploration for Task-oriented Dialogue Policy Optimization](https://aclanthology.org/2024.tacl-1.86/) (Zhao et al., TACL 2024)
ACL