BibTeX
@inproceedings{chi-etal-2022-offline,
title = "Offline-to-Online Co-Evolutional User Simulator and Dialogue System",
author = "Chi, Dafeng and
Zhuang, Yuzheng and
Mu, Yao and
Wang, Bin and
Bao, Jianzhu and
Wang, Yasheng and
Dong, Yuhan and
Jiang, Xin and
Liu, Qun and
Hao, Jianye",
editor = "Ou, Zhijian and
Feng, Junlan and
Li, Juanzi",
booktitle = "Proceedings of the Towards Semi-Supervised and Reinforced Task-Oriented Dialog Systems (SereTOD)",
month = dec,
year = "2022",
address = "Abu Dhabi, Beijing (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.seretod-1.11",
doi = "10.18653/v1/2022.seretod-1.11",
pages = "98--113",
abstract = "Reinforcement learning (RL) has emerged as a promising approach to fine-tune offline pretrained GPT-2 model in task-oriented dialogue (TOD) systems. In order to obtain human-like online interactions while extending the usage of RL, building pretrained user simulators (US) along with dialogue systems (DS) and facilitating jointly fine-tuning via RL becomes prevalent. However, joint training brings distributional shift problem caused by compounding exposure bias. Existing methods usually iterative update US and DS to ameliorate the ensued non-stationarity problem, which could lead to sub-optimal policy and less sample efficiency. To take a step further for tackling the problem, we introduce an Offline-to-oNline Co-Evolutional (ONCE) framework, which enables bias-aware concurrent joint update for RL-based fine-tuning whilst takes advantages from GPT-2 based end-to-end modeling on US and DS. Extensive experiments demonstrate that ONCE builds high-quality loops of policy learning and dialogues data collection, and achieves state-of-the-art online and offline evaluation results on MultiWOZ2.1 dataset. Opensourced code will be implemented with Mindspore (MS, 2022) and released on our homepage.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chi-etal-2022-offline">
<titleInfo>
<title>Offline-to-Online Co-Evolutional User Simulator and Dialogue System</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dafeng</namePart>
<namePart type="family">Chi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuzheng</namePart>
<namePart type="family">Zhuang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yao</namePart>
<namePart type="family">Mu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianzhu</namePart>
<namePart type="family">Bao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yasheng</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuhan</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xin</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qun</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianye</namePart>
<namePart type="family">Hao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Towards Semi-Supervised and Reinforced Task-Oriented Dialog Systems (SereTOD)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhijian</namePart>
<namePart type="family">Ou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junlan</namePart>
<namePart type="family">Feng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juanzi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, Beijing (Hybrid)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Reinforcement learning (RL) has emerged as a promising approach to fine-tuning offline pretrained GPT-2 models in task-oriented dialogue (TOD) systems. To obtain human-like online interactions while extending the use of RL, it has become prevalent to build pretrained user simulators (US) alongside dialogue systems (DS) and to fine-tune them jointly via RL. However, joint training introduces a distributional shift problem caused by compounding exposure bias. Existing methods usually update the US and DS iteratively to ameliorate the ensuing non-stationarity, which can lead to sub-optimal policies and lower sample efficiency. To take a step further in tackling this problem, we introduce an Offline-to-oNline Co-Evolutional (ONCE) framework, which enables bias-aware concurrent joint updates for RL-based fine-tuning while taking advantage of GPT-2-based end-to-end modeling of both the US and DS. Extensive experiments demonstrate that ONCE builds high-quality loops of policy learning and dialogue data collection, and achieves state-of-the-art online and offline evaluation results on the MultiWOZ2.1 dataset. Open-sourced code will be implemented with MindSpore (MS, 2022) and released on our homepage.</abstract>
<identifier type="citekey">chi-etal-2022-offline</identifier>
<identifier type="doi">10.18653/v1/2022.seretod-1.11</identifier>
<location>
<url>https://aclanthology.org/2022.seretod-1.11</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>98</start>
<end>113</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Offline-to-Online Co-Evolutional User Simulator and Dialogue System
%A Chi, Dafeng
%A Zhuang, Yuzheng
%A Mu, Yao
%A Wang, Bin
%A Bao, Jianzhu
%A Wang, Yasheng
%A Dong, Yuhan
%A Jiang, Xin
%A Liu, Qun
%A Hao, Jianye
%Y Ou, Zhijian
%Y Feng, Junlan
%Y Li, Juanzi
%S Proceedings of the Towards Semi-Supervised and Reinforced Task-Oriented Dialog Systems (SereTOD)
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, Beijing (Hybrid)
%F chi-etal-2022-offline
%X Reinforcement learning (RL) has emerged as a promising approach to fine-tuning offline pretrained GPT-2 models in task-oriented dialogue (TOD) systems. To obtain human-like online interactions while extending the use of RL, it has become prevalent to build pretrained user simulators (US) alongside dialogue systems (DS) and to fine-tune them jointly via RL. However, joint training introduces a distributional shift problem caused by compounding exposure bias. Existing methods usually update the US and DS iteratively to ameliorate the ensuing non-stationarity, which can lead to sub-optimal policies and lower sample efficiency. To take a step further in tackling this problem, we introduce an Offline-to-oNline Co-Evolutional (ONCE) framework, which enables bias-aware concurrent joint updates for RL-based fine-tuning while taking advantage of GPT-2-based end-to-end modeling of both the US and DS. Extensive experiments demonstrate that ONCE builds high-quality loops of policy learning and dialogue data collection, and achieves state-of-the-art online and offline evaluation results on the MultiWOZ2.1 dataset. Open-sourced code will be implemented with MindSpore (MS, 2022) and released on our homepage.
%R 10.18653/v1/2022.seretod-1.11
%U https://aclanthology.org/2022.seretod-1.11
%U https://doi.org/10.18653/v1/2022.seretod-1.11
%P 98-113
Markdown (Informal)
[Offline-to-Online Co-Evolutional User Simulator and Dialogue System](https://aclanthology.org/2022.seretod-1.11) (Chi et al., SereTOD 2022)
ACL
Dafeng Chi, Yuzheng Zhuang, Yao Mu, Bin Wang, Jianzhu Bao, Yasheng Wang, Yuhan Dong, Xin Jiang, Qun Liu, and Jianye Hao. 2022. Offline-to-Online Co-Evolutional User Simulator and Dialogue System. In Proceedings of the Towards Semi-Supervised and Reinforced Task-Oriented Dialog Systems (SereTOD), pages 98–113, Abu Dhabi, Beijing (Hybrid). Association for Computational Linguistics.