@inproceedings{hori-watanabe-hershey:2017:Long,
  author    = {Hori, Takaaki and Watanabe, Shinji and Hershey, John},
  title     = {Joint {CTC}/Attention Decoding for End-to-End Speech Recognition},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {518--529},
  doi       = {10.18653/v1/P17-1048},
  url       = {http://aclweb.org/anthology/P17-1048},
  abstract  = {End-to-end automatic speech recognition (ASR) has become a popular alternative
    to conventional DNN/HMM systems because it avoids the need for linguistic
    resources such as pronunciation dictionary, tokenization, and
    context-dependency trees, leading to a greatly simplified model-building
    process. There are two major types of end-to-end architectures for ASR:
    attention-based methods use an attention mechanism to perform alignment between
    acoustic frames and recognized symbols, and connectionist temporal
    classification (CTC), uses Markov assumptions to efficiently solve sequential
    problems by dynamic programming. This paper proposes joint decoding algorithm
    for end-to-end ASR with a hybrid CTC/attention architecture, which effectively
    utilizes both advantages in decoding. We have applied the proposed method to
    two ASR benchmarks (spontaneous Japanese and Mandarin Chinese), and showing the
    comparable performance to conventional state-of-the-art DNN/HMM ASR systems
    without linguistic resources.},
}

