@InProceedings{chen-EtAl:2017:EMNLP20172,
  author    = {Chen, Lu  and  Zhou, Xiang  and  Chang, Cheng  and  Yang, Runzhe  and  Yu, Kai},
  title     = {Agent-Aware Dropout {DQN} for Safe and Efficient On-line Dialogue Policy Learning},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2454--2464},
  abstract  = {Hand-crafted rules and reinforcement learning (RL) are two popular choices to
	obtain dialogue policy.  The rule-based policy is often reliable within
	predefined scope but not self-adaptable, whereas RL is evolvable with data but
	often suffers from a bad initial performance. We employ a \emph{companion
	learning} framework to integrate the two approaches for \emph{on-line} dialogue
	policy learning, in which a pre-defined rule-based policy acts as a
	``teacher'' and guides a data-driven RL system by giving example actions as
	well as additional rewards. A novel \emph{agent-aware dropout} Deep Q-Network
	(AAD-DQN) is proposed to address the problem of when to consult the teacher and
	how to learn from the teacher's experiences. AAD-DQN, as a data-driven student
	policy, provides (1) two separate experience memories for student and teacher,
	(2) an uncertainty estimated by dropout to control the timing of consultation
	and learning. Simulation experiments showed that the proposed approach can
	significantly improve both \emph{safety} and \emph{efficiency} of on-line policy
	optimization compared to other companion learning approaches as well as
	supervised pre-training using static dialogue corpus.},
  doi       = {10.18653/v1/D17-1260},
  url       = {https://www.aclweb.org/anthology/D17-1260},
}

