@Proceedings{Long:2017,
  editor    = {Barzilay, Regina and Kan, Min-Yen},
  title     = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {http://aclweb.org/anthology/P17-1}
}

@InProceedings{liu-qiu-huang:2017:Long,
  author    = {Liu, Pengfei and Qiu, Xipeng and Huang, Xuanjing},
  title     = {Adversarial Multi-task Learning for Text Classification},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1--10},
  abstract  = {Neural network models have shown their promising opportunities for multi-task
    learning, which focus on learning the shared layers to extract the common and
    task-invariant features. However, in most existing approaches, the extracted
    shared features are prone to be contaminated by task-specific features or the
    noise brought by other tasks.
    In this paper, we propose an adversarial multi-task learning framework,
    alleviating the shared and private latent feature spaces from interfering with
    each other.
    We conduct extensive experiments on 16 different text classification tasks,
    which demonstrates the benefits of our approach. Besides, we show that the
    shared knowledge learned by our proposed model can be regarded as off-the-shelf
    knowledge and easily transferred to new tasks.
    The datasets of all 16 tasks are publicly available at
    http://nlp.fudan.edu.cn/data/},
  url       = {http://aclweb.org/anthology/P17-1001}
}

@InProceedings{eger-daxenberger-gurevych:2017:Long,
  author    = {Eger, Steffen and Daxenberger, Johannes and Gurevych, Iryna},
  title     = {Neural End-to-End Learning for Computational Argumentation Mining},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {11--22},
  abstract  = {We investigate neural techniques for end-to-end computational argumentation
    mining (AM). We frame AM both as a token-based dependency parsing and as a
    token-based sequence tagging problem, including a multi-task learning setup.
    Contrary to models that operate on the argument component level, we find that
    framing AM as dependency parsing leads to subpar performance results. In
    contrast, less complex (local) tagging models based on BiLSTMs perform robustly
    across classification scenarios, being able to catch long-range dependencies
    inherent to the AM problem. Moreover, we find that jointly
    learning `natural' subtasks, in a multi-task learning setup, improves
    performance.},
  url       = {http://aclweb.org/anthology/P17-1002}
}

@InProceedings{liang-EtAl:2017:Long,
  author    = {Liang, Chen and Berant, Jonathan and Le, Quoc and Forbus, Kenneth D. and Lao, Ni},
  title     = {Neural Symbolic Machines: Learning Semantic Parsers on {Freebase} with Weak Supervision},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {23--33},
  abstract  = {Harnessing the statistical power of neural networks to perform language
    understanding and symbolic reasoning is difficult, when it requires executing
    efficient discrete operations against a large knowledge-base. In this work, we
    introduce a Neural Symbolic Machine, which contains (a) a neural ``programmer'',
    i.e., a sequence-to-sequence model that maps language utterances to programs
    and utilizes a key-variable memory to handle compositionality (b) a symbolic
    ``computer'', i.e., a Lisp interpreter that performs program execution, and helps
    find good programs by pruning the search space. We apply REINFORCE to directly
    optimize the task reward of this structured prediction problem. To train with
    weak supervision and improve the stability of REINFORCE, we augment it with an
    iterative maximum-likelihood training process. NSM outperforms the
    state-of-the-art on the WebQuestionsSP dataset when trained from
    question-answer pairs only, without requiring any feature engineering or
    domain-specific knowledge.},
  url       = {http://aclweb.org/anthology/P17-1003}
}

@InProceedings{lin-liu-sun:2017:Long,
  author    = {Lin, Yankai and Liu, Zhiyuan and Sun, Maosong},
  title     = {Neural Relation Extraction with Multi-lingual Attention},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {34--43},
  abstract  = {Relation extraction has been widely used for finding unknown relational facts
    from plain text. Most existing methods focus on exploiting mono-lingual data
    for relation extraction, ignoring massive information from the texts in various
    languages. To address this issue, we introduce a multi-lingual neural relation
    extraction framework, which employs mono-lingual attention to utilize the
    information within mono-lingual texts and further proposes cross-lingual
    attention to consider the information consistency and complementarity among
    cross-lingual texts. Experimental results on real-world datasets show that, our
    model can take advantage of multi-lingual texts and consistently achieve
    significant improvements on relation extraction as compared with baselines.},
  url       = {http://aclweb.org/anthology/P17-1004}
}

@InProceedings{cheng-EtAl:2017:Long,
  author    = {Cheng, Jianpeng and Reddy, Siva and Saraswat, Vijay and Lapata, Mirella},
  title     = {Learning Structured Natural Language Representations for Semantic Parsing},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {44--55},
  abstract  = {We introduce a neural semantic parser which is interpretable and scalable. Our
    model converts natural language utterances to intermediate, domain-general
    natural language representations in the form of predicate-argument structures,
    which are induced with a transition system and subsequently mapped to target
    domains. The semantic parser is trained end-to-end using annotated logical
    forms or their denotations. We achieve the state of the art on SPADES and
    GRAPHQUESTIONS and obtain competitive results on GEOQUERY and WEBQUESTIONS. The
    induced predicate-argument structures shed light on the types of
    representations useful for semantic parsing and how these are different from
    linguistically motivated ones.},
  url       = {http://aclweb.org/anthology/P17-1005}
}

@InProceedings{vulic-EtAl:2017:Long,
  author    = {Vuli{\'c}, Ivan and Mrk{\v{s}}i{\'c}, Nikola and Reichart, Roi and {\'O} S{\'e}aghdha, Diarmuid and Young, Steve and Korhonen, Anna},
  title     = {Morph-fitting: Fine-Tuning Word Vector Spaces with Simple Language-Specific Rules},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {56--68},
  abstract  = {Morphologically rich languages accentuate two properties of distributional
    vector space models: 1) the difficulty of inducing accurate representations for
    low-frequency word forms; and 2) insensitivity to distinct lexical relations
    that have similar distributional signatures. These effects are detrimental for
    language understanding systems, which may infer that 'inexpensive' is a
    rephrasing for 'expensive' or may not associate 'acquire' with 'acquires'. In
    this work, we propose a novel morph-fitting procedure which moves past the use
    of curated semantic lexicons for improving distributional vector spaces.
    Instead, our method injects morphological constraints generated using simple
    language-specific rules, pulling inflectional forms of the same word close
    together and pushing derivational antonyms far apart. In intrinsic evaluation
    over four languages, we show that our approach: 1) improves low-frequency word
    estimates; and 2) boosts the semantic quality of the entire word vector
    collection. Finally, we show that morph-fitted vectors yield large gains in the
    downstream task of dialogue state tracking, highlighting the importance of
    morphology for tackling long-tail phenomena in language understanding tasks.},
  url       = {http://aclweb.org/anthology/P17-1006}
}

@InProceedings{gittens-achlioptas-mahoney:2017:Long,
  author    = {Gittens, Alex and Achlioptas, Dimitris and Mahoney, Michael W.},
  title     = {{Skip-Gram} - {Zipf} + Uniform = Vector Additivity},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {69--76},
  abstract  = {In recent years word-embedding models have gained great popularity due to their
    remarkable performance on several tasks, including word analogy questions and
    caption generation. An unexpected ``side-effect'' of such models is that their
    vectors often exhibit compositionality, i.e., adding two word-vectors
    results in a vector that is only a small angle away from the vector of a word
    representing the semantic composite of the original words, e.g., ``man'' +
    ``royal'' = ``king''.
    This work provides a theoretical justification for the presence of additive
    compositionality in word vectors learned using the Skip-Gram model. In
    particular, it shows that additive compositionality holds in an even stricter
    sense (small distance rather than small angle) under certain assumptions on the
    process generating the corpus. As a corollary, it explains the success of
    vector calculus in solving word analogies. When these assumptions do not hold,
    this work describes the correct non-linear composition operator.
    Finally, this work establishes a connection between the Skip-Gram model and the
    Sufficient Dimensionality Reduction (SDR) framework of Globerson and Tishby:
    the parameters of SDR models can be obtained from those of Skip-Gram models
    simply by adding information on symbol frequencies. This shows that Skip-Gram
    embeddings are optimal in the sense of Globerson and Tishby and, further,
    implies that the heuristics commonly used to approximately fit Skip-Gram models
    can be used to fit SDR models.},
  url       = {http://aclweb.org/anthology/P17-1007}
}

@InProceedings{abend-rappoport:2017:Long,
  author    = {Abend, Omri and Rappoport, Ari},
  title     = {The State of the Art in Semantic Representation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {77--89},
  abstract  = {Semantic representation is receiving growing attention in NLP in the past few
    years, and many proposals for semantic schemes (e.g., AMR, UCCA, GMB, UDS) have
    been put forth. Yet, little has been done to assess the achievements and the
    shortcomings of these new contenders, compare them with syntactic schemes, and
    clarify the general goals of research on semantic representation. We address
    these gaps by critically surveying the state of the art in the field.},
  url       = {http://aclweb.org/anthology/P17-1008}
}

@InProceedings{lu-ng:2017:Long,
  author    = {Lu, Jing and Ng, Vincent},
  title     = {Joint Learning for Event Coreference Resolution},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {90--101},
  abstract  = {While joint models have been developed for many NLP tasks, the vast majority of
    event coreference resolvers, including the top-performing resolvers competing
    in the recent TAC KBP 2016 Event Nugget Detection and Coreference task, are
    pipeline-based, where the propagation of errors from the trigger detection
    component to the event coreference component is a major performance limiting
    factor. To address this problem, we propose a model for jointly learning event
    coreference, trigger detection, and event anaphoricity. Our joint model is
    novel in its choice of tasks and its features for capturing cross-task
    interactions. To our knowledge, this is the first attempt to train a
    mention-ranking model and employ event anaphoricity for event coreference. Our
    model achieves the best results to date on the KBP 2016 English and Chinese
    datasets.},
  url       = {http://aclweb.org/anthology/P17-1009}
}

@InProceedings{liu-EtAl:2017:Long1,
  author    = {Liu, Ting and Cui, Yiming and Yin, Qingyu and Zhang, Wei-Nan and Wang, Shijin and Hu, Guoping},
  title     = {Generating and Exploiting Large-scale Pseudo Training Data for Zero Pronoun Resolution},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {102--111},
  abstract  = {Most existing approaches for zero pronoun resolution are heavily relying on
    annotated data, which is often released by shared task organizers.
    Therefore, the lack of annotated data becomes a major obstacle in the progress
    of zero pronoun resolution task.
    Also, it is expensive to spend manpower on labeling the data for better
    performance.
    To alleviate the problem above, in this paper, we propose a simple but novel
    approach to automatically generate large-scale pseudo training data for zero
    pronoun resolution.
    Furthermore, we successfully transfer the cloze-style reading comprehension
    neural network model into zero pronoun resolution task and propose a two-step
    training mechanism to overcome the gap between the pseudo training data and the
    real one.
    Experimental results show that the proposed approach significantly outperforms
    the state-of-the-art systems with an absolute improvements of 3.1\% F-score on
    OntoNotes 5.0 data.},
  url       = {http://aclweb.org/anthology/P17-1010}
}

@InProceedings{song-EtAl:2017:Long,
  author    = {Song, Wei and Wang, Dong and Fu, Ruiji and Liu, Lizhen and Liu, Ting and Hu, Guoping},
  title     = {Discourse Mode Identification in Essays},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {112--122},
  abstract  = {Discourse modes play an important role in writing composition and evaluation.
    This paper presents a study on the manual and automatic identification of
    narration, exposition, description, argument and emotion expressing sentences in
    narrative essays. We annotate a corpus to study the characteristics of
    discourse modes
    and describe a neural sequence labeling model for identification. Evaluation
    results show that discourse modes can be identified automatically with an
    average F1-score of 0.7. We further demonstrate that discourse modes can be
    used as features that improve automatic essay scoring (AES). The impacts of
    discourse modes for AES are also discussed.},
  url       = {http://aclweb.org/anthology/P17-1011}
}

@InProceedings{gehring-EtAl:2017:Long,
  author    = {Gehring, Jonas and Auli, Michael and Grangier, David and Dauphin, Yann},
  title     = {A Convolutional Encoder Model for Neural Machine Translation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {123--135},
  abstract  = {The prevalent approach to neural machine translation relies on bi-directional
    LSTMs to encode the source sentence. We present a faster and simpler
    architecture based on a succession of convolutional layers. This allows to
    encode the source sentence simultaneously compared to recurrent networks for
    which computation is constrained by temporal dependencies. On WMT'16
    English-Romanian translation we achieve competitive accuracy to the
    state-of-the-art and on WMT'15 English-German we outperform several recently
    published results. Our models obtain almost the same accuracy as a very deep
    LSTM setup on WMT'14 English-French translation. We speed up CPU decoding by
    more than two times at the same or higher accuracy as a strong bi-directional
    LSTM.},
  url       = {http://aclweb.org/anthology/P17-1012}
}

@InProceedings{wang-EtAl:2017:Long1,
  author    = {Wang, Mingxuan and Lu, Zhengdong and Zhou, Jie and Liu, Qun},
  title     = {Deep Neural Machine Translation with Linear Associative Unit},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {136--145},
  abstract  = {Deep Neural Networks (DNNs) have provably enhanced the state-of-the-art
    Neural Machine Translation (NMT) with its capability in modeling complex
    functions and capturing complex linguistic structures. However NMT with deep
    architecture in its encoder or decoder RNNs often suffer from severe gradient
    diffusion due to the non-linear recurrent activations, which often makes the
    optimization much more difficult. To address this problem we propose a novel
    linear associative units (LAU) to reduce the gradient propagation path inside
    the recurrent unit. Different from conventional approaches (LSTM unit and GRU),
    LAUs uses linear associative connections between input and output of the
    recurrent unit, which allows unimpeded information flow through both space and
    time. The model is quite simple, but it is surprisingly effective. Our
    empirical study on Chinese-English translation shows that our model with proper
    configuration can improve by 11.7 BLEU upon Groundhog and the best reported
    results in the same setting. On WMT14 English-German task and a larger WMT14
    English-French task, our model achieves comparable results with the
    state-of-the-art.},
  url       = {http://aclweb.org/anthology/P17-1013}
}

@InProceedings{konstas-EtAl:2017:Long,
  author    = {Konstas, Ioannis and Iyer, Srinivasan and Yatskar, Mark and Choi, Yejin and Zettlemoyer, Luke},
  title     = {Neural {AMR}: Sequence-to-Sequence Models for Parsing and Generation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {146--157},
  abstract  = {Sequence-to-sequence models have shown strong performance across a broad range
    of applications. However, their application to parsing and generating text
    using Abstract Meaning Representation (AMR) has been limited, due to the
    relatively limited amount of labeled data and the non-sequential nature of the
    AMR graphs.
    We present a novel training procedure that can lift this limitation using
    millions of unlabeled sentences and careful preprocessing of the AMR graphs.
    For AMR parsing, our model achieves competitive results of 62.1 SMATCH, the
    current best score reported without significant use of external semantic
    resources.
    For AMR generation, our model establishes a new state-of-the-art performance of
    BLEU 33.8.
    We present extensive ablative and qualitative analysis including strong
    evidence that sequence-based AMR models are robust against ordering variations
    of graph-to-sequence conversions.},
  url       = {http://aclweb.org/anthology/P17-1014}
}

@InProceedings{ling-EtAl:2017:Long,
  author    = {Ling, Wang and Yogatama, Dani and Dyer, Chris and Blunsom, Phil},
  title     = {Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {158--167},
  abstract  = {Solving algebraic word problems requires executing a series of arithmetic
    operations---a program---to obtain a final answer. However, since programs can
    be arbitrarily complicated, inducing them directly from question-answer pairs
    is a formidable challenge. To make this task more feasible, we solve these
    problems by generating answer rationales, sequences of natural language and
    human-readable mathematical expressions that derive the final answer through a
    series of small steps. Although rationales do not explicitly specify programs,
    they provide a scaffolding for their structure via intermediate milestones. To
    evaluate our approach, we have created a new 100,000-sample dataset of
    questions, answers and rationales. Experimental results show that indirect
    supervision of program learning via answer rationales is a promising strategy
    for inducing arithmetic programs.},
  url       = {http://aclweb.org/anthology/P17-1015}
}

@InProceedings{hopkins-kiela:2017:Long,
  author    = {Hopkins, Jack and Kiela, Douwe},
  title     = {Automatically Generating Rhythmic Verse with Neural Networks},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {168--178},
  abstract  = {We propose two novel methodologies for the automatic generation of rhythmic
    poetry in a variety of forms. The first approach uses a neural language model
    trained on a phonetic encoding to learn an implicit representation of both the
    form and content of English poetry. This model can effectively learn common
    poetic devices such as rhyme, rhythm and alliteration. The second approach
    considers poetry generation as a constraint satisfaction problem where a
    generative neural language model is tasked with learning a representation of
    content, and a discriminative weighted finite state machine constrains it on
    the basis of form. By manipulating the constraints of the latter model, we can
    generate coherent poetry with arbitrary forms and themes. A large-scale
    extrinsic evaluation demonstrated that participants consider machine-generated
    poems to be written by humans 54\% of the time. In addition, participants rated
    a machine-generated poem to be the best amongst all evaluated.},
  url       = {http://aclweb.org/anthology/P17-1016}
}

@InProceedings{gardent-EtAl:2017:Long,
  author    = {Gardent, Claire and Shimorina, Anastasia and Narayan, Shashi and Perez-Beltrachini, Laura},
  title     = {Creating Training Corpora for {NLG} Micro-Planners},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {179--188},
  abstract  = {In this paper, we present a novel framework for semi-automatically
    creating linguistically challenging micro-planning data-to-text
    corpora from existing Knowledge Bases. Because our method pairs data
    of varying size and shape with texts ranging from simple clauses to
    short texts, a dataset created using this framework provides a
    challenging benchmark for microplanning. Another feature of this
    framework is that it can be applied to any large scale knowledge base
    and can therefore be used to train and learn KB verbalisers.  We apply
    our framework to DBpedia data and compare the resulting dataset with
    Wen et al. 2016's. We show that while Wen et al.'s dataset is
    more than twice larger than ours, it is less diverse both in terms of
    input and in terms of text. We thus propose our corpus generation
    framework as a novel method for creating challenging data sets from
    which NLG models can be learned which are capable of handling the
    complex interactions occurring during in micro-planning between
    lexicalisation, aggregation, surface realisation, referring expression
    generation and sentence segmentation. To encourage researchers to take
    up this challenge, we made available a dataset of 21,855 data/text
    pairs created using this framework in the context of the
    WebNLG shared task.},
  url       = {http://aclweb.org/anthology/P17-1017}
}

@InProceedings{wang-EtAl:2017:Long2,
  author    = {Wang, Wenhui and Yang, Nan and Wei, Furu and Chang, Baobao and Zhou, Ming},
  title     = {Gated Self-Matching Networks for Reading Comprehension and Question Answering},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {189--198},
  abstract  = {In this paper, we present the gated self-matching networks for reading
    comprehension style question answering, which aims to answer questions from a
    given passage. We first match the question and passage with gated
    attention-based recurrent networks to obtain the question-aware passage
    representation. Then we propose a self-matching attention mechanism to refine
    the representation by matching the passage against itself, which effectively
    encodes information from the whole passage. We finally employ the pointer
    networks to locate the positions of answers from the passages. We conduct
    extensive experiments on the SQuAD dataset. The single model achieves 71.3\% on
    the evaluation metrics of exact match on the hidden test set, while the
    ensemble model further boosts the results to 75.9\%. At the time of submission
    of the paper, our model holds the first place on the SQuAD leaderboard for both
    single and ensemble model.},
  url       = {http://aclweb.org/anthology/P17-1018}
}

@InProceedings{he-EtAl:2017:Long1,
  author    = {He, Shizhu and Liu, Cao and Liu, Kang and Zhao, Jun},
  title     = {Generating Natural Answers by Incorporating Copying and Retrieving Mechanisms in Sequence-to-Sequence Learning},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {199--208},
  abstract  = {Generating answer with natural language sentence is very important in
    real-world question answering systems, which needs to obtain a right answer as
    well as a coherent natural response. In this paper, we propose an end-to-end
    question answering system called COREQA in sequence-to-sequence learning, which
    incorporates copying and retrieving mechanisms to generate natural answers
    within an encoder-decoder framework. Specifically, in COREQA, the semantic
    units (words, phrases and entities) in a natural answer are dynamically
    predicted from the vocabulary, copied from the given question and/or retrieved
    from the corresponding knowledge base jointly. Our empirical study on both
    synthetic and real-world datasets demonstrates the efficiency of COREQA, which
    is able to generate correct, coherent and natural answers for knowledge
    inquired questions.},
  url       = {http://aclweb.org/anthology/P17-1019}
}

@InProceedings{choi-EtAl:2017:Long,
  author    = {Choi, Eunsol and Hewlett, Daniel and Uszkoreit, Jakob and Polosukhin, Illia and Lacoste, Alexandre and Berant, Jonathan},
  title     = {Coarse-to-Fine Question Answering for Long Documents},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {209--220},
  abstract  = {We present a framework for question answering that can efficiently scale to
    longer documents while maintaining or even improving performance of
    state-of-the-art models. While most successful approaches for reading
    comprehension rely on recurrent neural networks (RNNs), running them over long
    documents is prohibitively slow because it is difficult to parallelize over
    sequences. Inspired by how people first skim the document, identify relevant
    parts, and carefully read these parts to produce an answer, we combine a
    coarse, fast model for selecting relevant sentences and a more expensive RNN
    for producing the answer from those sentences.
    We treat sentence selection as a latent variable trained jointly from the
    answer only using reinforcement learning. Experiments demonstrate
    state-of-the-art performance on a challenging subset of the WikiReading dataset
    and on a new dataset, while speeding up the model by 3.5x-6.7x.},
  url       = {http://aclweb.org/anthology/P17-1020}
}

@InProceedings{hao-EtAl:2017:Long,
  author    = {Hao, Yanchao and Zhang, Yuanzhe and Liu, Kang and He, Shizhu and Liu, Zhanyi and Wu, Hua and Zhao, Jun},
  title     = {An End-to-End Model for Question Answering over Knowledge Base with Cross-Attention Combining Global Knowledge},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {221--231},
  abstract  = {With the rapid growth of knowledge bases (KBs) on the web, how to take full
    advantage of them becomes increasingly important. Question answering over
    knowledge base (KB-QA) is one of the promising approaches to access the
    substantial knowledge. Meanwhile, as the neural network-based (NN-based)
    methods develop, NN-based KB-QA has already achieved impressive results.
    However, previous work did not put more emphasis on question representation,
    and the question is converted into a fixed vector regardless of its candidate
    answers. This simple representation strategy is not easy to express the proper
    information in the question. Hence, we present an end-to-end neural network
    model to represent the questions and their corresponding scores dynamically
    according to the various candidate answer aspects via cross-attention
    mechanism. In addition, we leverage the global knowledge inside the underlying
    KB, aiming at integrating the rich KB information into the representation of
    the answers. As a result, it could alleviates the out-of-vocabulary (OOV)
    problem, which helps the cross-attention model to represent the question more
    precisely. The experimental results on WebQuestions demonstrate the
    effectiveness of the proposed approach.},
  url       = {http://aclweb.org/anthology/P17-1021}
}

% P17-1022 (ACL 2017 Long Papers, pp. 232--242): translating learned multiagent messages.
@InProceedings{andreas-dragan-klein:2017:Long,
  author    = {Andreas, Jacob  and  Dragan, Anca  and  Klein, Dan},
  title     = {Translating Neuralese},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {232--242},
  abstract  = {Several approaches have recently been proposed for learning decentralized deep
	multiagent policies that coordinate via a differentiable communication channel.
	While these policies are effective for many tasks, interpretation of their
	induced communication strategies has remained a challenge. Here we propose to
	interpret agents' messages by translating them.  Unlike in typical machine
	translation problems, we have no parallel data to learn from. Instead we
	develop
	a translation model based on the insight that agent messages and natural
	language strings mean the same thing if they induce the same belief about
	the world in a listener.  We present theoretical guarantees and empirical
	evidence that our approach preserves both the semantics and pragmatics of
	messages by ensuring that players communicating through a translation layer do
	not suffer a substantial loss in reward relative to players with a common
	language.},
  url       = {http://aclweb.org/anthology/P17-1022}
}

% P17-1023 (ACL 2017 Long Papers, pp. 243--254): referential word meaning for object naming.
@InProceedings{zarriess-schlangen:2017:Long,
  author    = {Zarrie{\ss}, Sina  and  Schlangen, David},
  title     = {Obtaining referential word meanings from visual and distributional information: Experiments on object naming},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {243--254},
  abstract  = {We investigate object naming, which is an important sub-task of referring
	expression generation on real-world images. As opposed to mutually exclusive
	labels used in object recognition, object names are more flexible, subject to
	communicative preferences and semantically related to each other. Therefore, we
	investigate models of referential word meaning that link visual to lexical
	information which we assume to be given through distributional word embeddings.
	We present a model that learns individual predictors for object names that link
	visual and distributional aspects of word meaning during training. We show that
	this is particularly beneficial for zero-shot learning, as compared to
	projecting visual objects directly into the distributional space. In a standard
	object naming task, we find that different ways of combining lexical and visual
	information achieve very similar performance, though experiments on model
	combination suggest that they capture complementary aspects of referential
	meaning.},
  url       = {http://aclweb.org/anthology/P17-1023}
}

% P17-1024 (ACL 2017 Long Papers, pp. 255--265): FOIL-COCO dataset for probing language-and-vision models.
@InProceedings{shekhar-EtAl:2017:Long,
  author    = {Shekhar, Ravi  and  Pezzelle, Sandro  and  Klimovich, Yauhen  and  Herbelot, Aur\'{e}lie  and  Nabi, Moin  and  Sangineto, Enver  and  Bernardi, Raffaella},
  title     = {FOIL it! Find One mismatch between Image and Language caption},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {255--265},
  abstract  = {In this paper, we aim to understand whether current language and vision (LaVi)
	models truly grasp the interaction between the two modalities. To this end, we
	propose an extension of the MS-COCO dataset, FOIL-COCO, which associates images
	with both correct and `foil' captions, that is, descriptions of the image that
	are highly similar to the original ones, but contain one single mistake (`foil
	word'). We show that current LaVi models fall into the traps of this data and
	perform badly on three tasks: a) caption  classification (correct vs. foil); b)
	foil word detection; c) foil word correction. Humans, in contrast, have
	near-perfect performance on those tasks. We demonstrate that merely utilising
	language cues is not enough to model FOIL-COCO and that it challenges the
	state-of-the-art by requiring a fine-grained understanding of the relation
	between text and image.},
  url       = {http://aclweb.org/anthology/P17-1024}
}

% P17-1025 (ACL 2017 Long Papers, pp. 266--276): relative physical knowledge from text.
% Fixed: raw Unicode curly quotes in the abstract replaced with LaTeX ``...'' so the
% field is safe under classic (8-bit) BibTeX, consistent with the file's ASCII escapes.
@InProceedings{forbes-choi:2017:Long,
  author    = {Forbes, Maxwell  and  Choi, Yejin},
  title     = {Verb Physics: Relative Physical Knowledge of Actions and Objects},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {266--276},
  abstract  = {Learning commonsense knowledge from natural language text is nontrivial due to
	reporting bias: people rarely state the obvious, e.g., ``My house is bigger
	than me.'' However, while rarely stated explicitly, this trivial everyday
	knowledge does influence the way people talk about the world, which provides
	indirect clues to reason about the world. For example, a statement like,
	``Tyler entered his house'' implies that his house is bigger than Tyler.
	In this paper, we present an approach to infer relative physical knowledge of
	actions and objects along five dimensions (e.g., size, weight, and strength)
	from unstructured natural language text. We frame knowledge acquisition as
	joint inference over two closely related problems: learning (1) relative
	physical knowledge of object pairs and (2) physical implications of actions
	when applied to those object pairs. Empirical results demonstrate that it is
	possible to extract knowledge of actions and objects from language and that
	joint inference over different types of knowledge improves performance.},
  url       = {http://aclweb.org/anthology/P17-1025}
}

% P17-1026 (ACL 2017 Long Papers, pp. 277--287): A* CCG parsing, supertag+dependency factored model.
@InProceedings{yoshikawa-noji-matsumoto:2017:Long,
  author    = {Yoshikawa, Masashi  and  Noji, Hiroshi  and  Matsumoto, Yuji},
  title     = {A* CCG Parsing with a Supertag and Dependency Factored Model},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {277--287},
  abstract  = {We propose a new A* CCG parsing model in which the probability of a tree is
	decomposed into factors of CCG categories and its syntactic dependencies both
	defined on bi-directional LSTMs. Our factored model allows the precomputation
	of all probabilities and runs very efficiently, while modeling sentence
	structures explicitly via dependencies. Our model achieves the state-of-the-art
	results on English and Japanese CCG parsing.},
  url       = {http://aclweb.org/anthology/P17-1026}
}

% P17-1027 (ACL 2017 Long Papers, pp. 288--298): non-monotonic transition system for non-projective parsing.
% Fixed: accent over dotted i in "Rodr\'{i}guez" now uses dotless \i, the correct
% LaTeX form for accented i (renders wrongly in OT1 encoding otherwise).
@InProceedings{fernandezgonzalez-gomezrodriguez:2017:Long,
  author    = {Fern\'{a}ndez-Gonz\'{a}lez, Daniel  and  G\'{o}mez-Rodr\'{\i}guez, Carlos},
  title     = {A Full Non-Monotonic Transition System for Unrestricted Non-Projective Parsing},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {288--298},
  abstract  = {Restricted non-monotonicity has been shown beneficial for the projective
	arc-eager dependency parser in previous research, as posterior decisions can
	repair mistakes made in previous states due to the lack of information. In this
	paper, we propose a novel, fully non-monotonic transition system based on the
	non-projective Covington algorithm. As a non-monotonic system requires
	exploration of erroneous actions during the training process, we develop
	several non-monotonic variants of the recently defined dynamic oracle for the
	Covington parser, based on tight approximations of the loss. Experiments on
	datasets from the CoNLL-X and CoNLL-XI shared tasks show that a non-monotonic
	dynamic oracle outperforms the monotonic version in the majority of languages.},
  url       = {http://aclweb.org/anthology/P17-1027}
}

% P17-1028 (ACL 2017 Long Papers, pp. 299--309): aggregating and predicting crowd sequence labels.
@InProceedings{nguyen-EtAl:2017:Long,
  author    = {Nguyen, An Thanh  and  Wallace, Byron  and  Li, Junyi Jessy  and  Nenkova, Ani  and  Lease, Matthew},
  title     = {Aggregating and Predicting Sequence Labels from Crowd Annotations},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {299--309},
  abstract  = {Despite sequences being core to NLP, scant work has considered how to handle
	noisy sequence labels from multiple annotators for the same text. Given such
	annotations, we consider two complementary tasks:  (1) aggregating sequential
	crowd labels to infer a best single set of consensus annotations; and (2) using
	crowd annotations as training data for a model that can predict sequences in
	unannotated text. For aggregation, we propose a novel Hidden Markov Model
	variant. To predict sequences in unannotated text, we propose a neural approach
	using Long Short Term Memory. We evaluate a suite of methods across two
	different applications and text genres: Named-Entity Recognition in news
	articles and Information Extraction from biomedical abstracts. Results show
	improvement over strong baselines. Our source code and data are available
	online.},
  url       = {http://aclweb.org/anthology/P17-1028}
}

% P17-1029 (ACL 2017 Long Papers, pp. 310--320): multi-space variational encoder-decoders.
@InProceedings{zhou-neubig:2017:Long,
  author    = {Zhou, Chunting  and  Neubig, Graham},
  title     = {Multi-space Variational Encoder-Decoders for Semi-supervised Labeled Sequence Transduction},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {310--320},
  abstract  = {Labeled sequence transduction is a task of transforming one sequence into
	another sequence that satisfies desiderata specified by a set of labels. In
	this paper we propose multi-space variational encoder-decoders, a new model for
	labeled sequence transduction with semi-supervised learning. The generative
	model can use neural networks to handle both discrete and continuous latent
	variables to exploit various features of data. Experiments show that our model
	provides not only a powerful supervised framework but also can effectively take
	advantage of the unlabeled data. On the SIGMORPHON morphological inflection
	benchmark, our model outperforms single-model state-of-art results by a large
	margin for the majority of languages.},
  url       = {http://aclweb.org/anthology/P17-1029}
}

% P17-1030 (ACL 2017 Long Papers, pp. 321--331): scalable Bayesian learning of RNN language models.
@InProceedings{gan-EtAl:2017:Long,
  author    = {Gan, Zhe  and  Li, Chunyuan  and  Chen, Changyou  and  Pu, Yunchen  and  Su, Qinliang  and  Carin, Lawrence},
  title     = {Scalable Bayesian Learning of Recurrent Neural Networks for Language Modeling},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {321--331},
  abstract  = {Recurrent neural networks (RNNs) have shown promising performance for language
	modeling. However, traditional training of RNNs using back-propagation through
	time often suffers from overfitting. One reason for this is that stochastic
	optimization (used for large training sets) does not provide good estimates of
	model uncertainty. This paper leverages recent advances in stochastic gradient
	Markov Chain Monte Carlo (also appropriate for large training sets) to learn
	weight uncertainty in RNNs. It yields a principled Bayesian learning algorithm,
	adding gradient noise during training (enhancing exploration of the
	model-parameter space) and model averaging when testing. Extensive experiments
	on various RNN models and across a broad range of applications demonstrate the
	superiority of the proposed approach relative to stochastic optimization.},
  url       = {http://aclweb.org/anthology/P17-1030}
}

% P17-1031 (ACL 2017 Long Papers, pp. 332--344): attention for historical text normalization via MTL.
% Fixed: unescaped "%" in the abstract would begin a LaTeX comment if the field is
% ever typeset; escaped as \%.
@InProceedings{bollmann-bingel-sogaard:2017:Long,
  author    = {Bollmann, Marcel  and  Bingel, Joachim  and  S{\o}gaard, Anders},
  title     = {Learning attention for historical text normalization by learning to pronounce},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {332--344},
  abstract  = {Automated processing of historical texts often relies on pre-normalization to
	modern word forms. Training encoder-decoder architectures to solve such
	problems typically requires a lot of training data, which is not available for
	the named task. We address this problem by using several novel encoder-decoder
	architectures, including a multi-task learning (MTL) architecture using a
	grapheme-to-phoneme dictionary as auxiliary data, pushing the state-of-the-art
	by an absolute 2\% increase in performance. We analyze the induced models across
	44 different texts from Early New High German. Interestingly, we observe that,
	as previously conjectured, multi-task learning can learn to focus attention
	during decoding, in ways remarkably similar to recently proposed attention
	mechanisms. This, we believe, is an important step toward understanding how MTL
	works.},
  url       = {http://aclweb.org/anthology/P17-1031}
}

% P17-1032 (ACL 2017 Long Papers, pp. 345--354): deep learning in semantic kernel spaces.
% Fixed: mismatched LaTeX quotes ``kernelized" (straight closing quote) now paired
% correctly as ``kernelized''.
@InProceedings{croce-EtAl:2017:Long,
  author    = {Croce, Danilo  and  Filice, Simone  and  Castellucci, Giuseppe  and  Basili, Roberto},
  title     = {Deep Learning in Semantic Kernel Spaces},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {345--354},
  abstract  = {Kernel methods enable the direct usage of structured representations of textual
	data during language learning and inference tasks. Expressive kernels, such as
	Tree Kernels, achieve excellent performance in NLP.
	On the other side, deep neural networks have been demonstrated effective in
	automatically learning feature representations during training. However, their
	input is tensor data, i.e., they can not manage rich structured information.
	In this paper, we show that expressive kernels and deep neural networks can be
	combined in a common framework in order to (i) explicitly model structured
	information and (ii) learn non-linear decision functions. We show that the
	input layer of a deep architecture can be pre-trained through the application
	of the Nystrom low-rank approximation of kernel spaces.
	The resulting ``kernelized'' neural network achieves state-of-the-art accuracy
	in three different tasks.},
  url       = {http://aclweb.org/anthology/P17-1032}
}

% P17-1033 (ACL 2017 Long Papers, pp. 355--365): topically driven neural language model.
@InProceedings{lau-baldwin-cohn:2017:Long,
  author    = {Lau, Jey Han  and  Baldwin, Timothy  and  Cohn, Trevor},
  title     = {Topically Driven Neural Language Model},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {355--365},
  abstract  = {Language models are typically applied at the sentence level, without
	  access to the broader document context.  We present a neural language
	  model that incorporates document context in the form of a topic
	  model-like architecture, thus providing a succinct representation of the
	  broader document context outside of the current sentence.  Experiments
	  over a range of datasets demonstrate that our model outperforms a pure
	  sentence-based model in terms of language model perplexity, and leads
	  to topics that are potentially more coherent than those produced by a
	  standard LDA topic model.  Our model also has the ability to generate
	  related sentences for a topic, providing another way to interpret topics.},
  url       = {http://aclweb.org/anthology/P17-1033}
}

% P17-1034 (ACL 2017 Long Papers, pp. 366--376): cold-start review spam detection.
@InProceedings{wang-liu-zhao:2017:Long,
  author    = {Wang, Xuepeng  and  Liu, Kang  and  Zhao, Jun},
  title     = {Handling Cold-Start Problem in Review Spam Detection by Jointly Embedding Texts and Behaviors},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {366--376},
  abstract  = {Solving cold-start problem in review spam detection is an urgent and
	significant task.
	It can help the on-line review websites to relieve the damage of spammers in
	time, but has never been investigated by previous work.
	This paper proposes a novel neural network model to detect review spam for
	cold-start problem, by learning to represent the new reviewers' review with
	jointly embedded textual and behavioral information.
	Experimental results prove the proposed model achieves an effective performance
	and possesses preferable domain-adaptability.
	It is also applicable to a large scale dataset in an unsupervised way.},
  url       = {http://aclweb.org/anthology/P17-1034}
}

% P17-1035 (ACL 2017 Long Papers, pp. 377--387): CNN over gaze data for sentiment/sarcasm.
@InProceedings{mishra-dey-bhattacharyya:2017:Long,
  author    = {Mishra, Abhijit  and  Dey, Kuntal  and  Bhattacharyya, Pushpak},
  title     = {Learning Cognitive Features from Gaze Data for Sentiment and Sarcasm Classification using Convolutional Neural Network},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {377--387},
  abstract  = {Cognitive NLP systems- i.e., NLP systems that make use of behavioral data -
	augment traditional text-based features with cognitive features extracted from
	eye-movement patterns, EEG signals, brain-imaging etc. Such extraction of
	features is typically manual. We contend that manual extraction of features may
	not be the best way to tackle text subtleties that characteristically prevail
	in complex classification tasks like Sentiment Analysis and Sarcasm Detection,
	and that even the extraction and choice of features should be delegated to the
	learning system.  We introduce a framework to automatically extract cognitive
	features from the eye-movement/gaze data of human readers reading the text and
	use them as features along with textual features for the tasks of sentiment
	polarity and sarcasm detection. Our proposed framework is based on
	Convolutional Neural Network (CNN). The CNN learns features from both gaze and
	text and uses them to classify the input text. We test our technique on
	published sentiment and sarcasm labeled datasets, enriched with gaze
	information, to show that using a combination of automatically learned text and
	gaze features often yields better classification performance over (i)  CNN
	based systems that rely on text input alone and (ii) existing systems that rely
	on handcrafted gaze and textual features.},
  url       = {http://aclweb.org/anthology/P17-1035}
}

% P17-1036 (ACL 2017 Long Papers, pp. 388--397): unsupervised neural attention for aspect extraction.
@InProceedings{he-EtAl:2017:Long2,
  author    = {He, Ruidan  and  Lee, Wee Sun  and  Ng, Hwee Tou  and  Dahlmeier, Daniel},
  title     = {An Unsupervised Neural Attention Model for Aspect Extraction},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {388--397},
  abstract  = {Aspect extraction is an important and challenging task in aspect-based
	sentiment analysis. Existing works tend to apply variants of topic models on
	this task. While fairly successful, these methods usually do not produce highly
	coherent aspects. In this paper, we present a novel neural approach with the
	aim of discovering coherent aspects. The model improves coherence by exploiting
	the distribution of word co-occurrences through the use of neural word
	embeddings. Unlike topic models which typically assume independently generated
	words, word embedding models encourage words that appear in similar contexts to
	be located close to each other in the embedding space. In addition, we use an
	attention mechanism to de-emphasize irrelevant words during training, further
	improving the coherence of aspects. Experimental results on real-life datasets
	demonstrate that our approach discovers more meaningful and coherent aspects,
	and substantially outperforms baseline methods on several evaluation tasks.},
  url       = {http://aclweb.org/anthology/P17-1036}
}

% P17-1037 (ACL 2017 Long Papers, pp. 398--408): inter-topic preferences via matrix factorization.
% Fixed: straight double quotes and one mismatched "..." pair in the abstract
% normalized to LaTeX ``...'' quoting, matching the file's ASCII/LaTeX conventions.
@InProceedings{sasaki-EtAl:2017:Long,
  author    = {Sasaki, Akira  and  Hanawa, Kazuaki  and  Okazaki, Naoaki  and  Inui, Kentaro},
  title     = {Other Topics You May Also Agree or Disagree: Modeling Inter-Topic Preferences using Tweets and Matrix Factorization},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {398--408},
  abstract  = {We presents in this paper our approach for modeling inter-topic preferences of
	Twitter
	users: for example, ``those who agree with the Trans-Pacific Partnership (TPP)
	also agree
	with free trade''. This kind of knowledge is useful not only for stance
	detection across multiple topics but also for various real-world applications
	including public opinion survey,
	electoral prediction, electoral campaigns, and online debates. In order to
	extract
	users' preferences on Twitter, we design linguistic patterns in which people
	agree
	and disagree about specific topics (e.g., ``A is completely wrong'').
	By applying these linguistic patterns to a collection of tweets, we extract
	statements agreeing and disagreeing with various topics. Inspired by previous
	work on
	item recommendation, we formalize the task of modeling inter-topic preferences
	as matrix factorization: representing users' preference as a user-topic matrix
	and mapping both users and topics onto a latent feature space that abstracts
	the preferences. Our experimental results demonstrate both that our presented
	approach is useful in predicting missing preferences of users and that the
	latent vector representations of topics successfully encode inter-topic
	preferences.},
  url       = {http://aclweb.org/anthology/P17-1037}
}

% P17-1038 (ACL 2017 Long Papers, pp. 409--419): automatic data labeling for event extraction.
@InProceedings{chen-EtAl:2017:Long1,
  author    = {Chen, Yubo  and  Liu, Shulin  and  Zhang, Xiang  and  Liu, Kang  and  Zhao, Jun},
  title     = {Automatically Labeled Data Generation for Large Scale Event Extraction},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {409--419},
  abstract  = {Modern models of event extraction for tasks like ACE are based on supervised
	learning of events from small hand-labeled data. However, hand-labeled training
	data is expensive to produce, in low coverage of event types, and limited in
	size, which makes supervised methods hard to extract large scale of events for
	knowledge base population. To solve the data labeling problem, we propose to
	automatically label training data for event extraction via world knowledge and
	linguistic knowledge, which can detect key arguments and trigger words for each
	event type and employ them to label events in texts automatically. The
	experimental results show that the quality of our large scale automatically
	labeled data is competitive with elaborately human-labeled data. And our
	automatically labeled data can incorporate with human-labeled data, then
	improve the performance of models learned from these data.},
  url       = {http://aclweb.org/anthology/P17-1038}
}

% P17-1039 (ACL 2017 Long Papers, pp. 420--429): SynTime rule-based time expression recognition.
@InProceedings{zhong-sun-cambria:2017:Long,
  author    = {Zhong, Xiaoshi  and  Sun, Aixin  and  Cambria, Erik},
  title     = {Time Expression Analysis and Recognition Using Syntactic Token Types and General Heuristic Rules},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {420--429},
  abstract  = {Extracting time expressions from free text is a fundamental task for many
	applications. We analyze the time expressions from four datasets and find that
	only a small group of words are used to express time information, and the words
	in time expressions demonstrate similar syntactic behaviour. Based on the
	findings, we propose a type-based approach, named SynTime, to recognize time
	expressions. Specifically, we define three main syntactic token types, namely
	time token, modifier, and numeral, to group time-related regular expressions
	over tokens. On the types we design general heuristic rules to
	recognize time expressions. In recognition, SynTime first identifies the time
	tokens from raw text, then searches their surroundings for modifiers and
	numerals to form time segments, and finally merges the time segments to time
	expressions. As a light-weight rule-based tagger, SynTime runs in real time,
	and can be easily expanded by simply adding keywords for the text of different
	types and of different domains. Experiment on benchmark datasets and tweets
	data shows that SynTime outperforms state-of-the-art methods.},
  url       = {http://aclweb.org/anthology/P17-1039}
}

% P17-1040 (ACL 2017 Long Papers, pp. 430--439): distant supervision with dynamic transition matrix.
@InProceedings{luo-EtAl:2017:Long,
  author    = {Luo, Bingfeng  and  Feng, Yansong  and  Wang, Zheng  and  Zhu, Zhanxing  and  Huang, Songfang  and  Yan, Rui  and  Zhao, Dongyan},
  title     = {Learning with Noise: Enhance Distantly Supervised Relation Extraction with Dynamic Transition Matrix},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {430--439},
  abstract  = {Distant supervision significantly reduces human efforts in building training
	data for many classification tasks. While promising, this technique often
	introduces noise to the generated training data, which can severely affect the
	model performance. In this paper, we take a deep look at the application of
	distant supervision in relation extraction. We show that the dynamic transition
	matrix can effectively characterize the noise in the training data built by
	distant supervision. The transition matrix can be effectively trained using a
	novel curriculum learning based method without any direct supervision about the
	noise. We thoroughly evaluate our approach under a wide range of extraction
	scenarios. Experimental results show that our approach consistently improves
	the extraction results and outperforms the state-of-the-art in various
	evaluation scenarios.},
  url       = {http://aclweb.org/anthology/P17-1040}
}

% P17-1041 (ACL 2017 Long Papers, pp. 440--450): syntactic neural model for code generation.
@InProceedings{yin-neubig:2017:Long,
  author    = {Yin, Pengcheng  and  Neubig, Graham},
  title     = {A Syntactic Neural Model for General-Purpose Code Generation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {440--450},
  abstract  = {We consider the problem of parsing natural language descriptions into source
	code written in a general-purpose programming language like Python. Existing
	data-driven methods treat this problem as a language generation task without
	considering the underlying syntax of the target programming language. Informed
	by previous work in semantic parsing, in this paper we propose a novel neural
	architecture powered by a grammar model to explicitly capture the target syntax
	as prior knowledge. Experiments find this an effective way to scale up to
	generation of complex programs from natural language descriptions, achieving
	state-of-the-art results that well outperform previous code generation and
	semantic parsing approaches.},
  url       = {http://aclweb.org/anthology/P17-1041}
}

% P17-1042 (ACL 2017 Long Papers, pp. 451--462): bilingual embeddings with (almost) no bilingual data.
@InProceedings{artetxe-labaka-agirre:2017:Long,
  author    = {Artetxe, Mikel  and  Labaka, Gorka  and  Agirre, Eneko},
  title     = {Learning bilingual word embeddings with (almost) no bilingual data},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {451--462},
  abstract  = {Most methods to learn bilingual word embeddings rely on large parallel corpora,
	which is difficult to obtain for most language pairs. This has motivated an
	active research line to relax this requirement, with methods that use
	document-aligned corpora or bilingual dictionaries of a few thousand words
	instead. In this work, we further reduce the need of bilingual resources using
	a very simple self-learning approach that can be combined with any
	dictionary-based mapping technique. Our method exploits the structural
	similarity of embedding spaces, and works with as little bilingual evidence as
	a 25 word dictionary or even an automatically generated list of numerals,
	obtaining results comparable to those of systems that use richer resources.},
  url       = {http://aclweb.org/anthology/P17-1042}
}

% P17-1043 (ACL 2017 Long Papers, pp. 463--472): AMR parsing with LSTM recurrent networks.
% Fixed: unescaped "%" in the abstract would begin a LaTeX comment if the field is
% ever typeset; escaped as \%.
@InProceedings{foland-martin:2017:Long,
  author    = {Foland, William  and  Martin, James H.},
  title     = {Abstract Meaning Representation Parsing using LSTM Recurrent Neural Networks},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {463--472},
  abstract  = {We present a system which parses sentences into Abstract Meaning
	Representations, improving state-of-the-art results for this task by more than
	5\%.  AMR graphs represent semantic content using linguistic properties such
	as semantic roles, coreference, negation, and more.  The AMR parser does not
	rely on a syntactic pre-parse, or heavily engineered features, and uses five
	recurrent neural networks as the key architectural components for inferring AMR
	graphs.},
  url       = {http://aclweb.org/anthology/P17-1043}
}

@InProceedings{he-EtAl:2017:Long3,
  author    = {He, Luheng  and  Lee, Kenton  and  Lewis, Mike  and  Zettlemoyer, Luke},
  title     = {Deep Semantic Role Labeling: What Works and What's Next},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {473--483},
  abstract  = {We introduce a new deep learning model for semantic role labeling (SRL) that
	significantly improves the state of the art, along with detailed analyses to
	reveal its strengths and limitations. We use a deep highway BiLSTM architecture
	with constrained decoding, while observing a number of recent best practices
	for initialization and regularization. Our 8-layer ensemble model achieves 83.2
	F1 on the CoNLL 2005 test set and 83.4 F1 on CoNLL 2012, roughly a 10\% relative
	error reduction over the previous state of the art. Extensive empirical
	analysis of these gains show that (1) deep models excel at recovering
	long-distance dependencies but can still make surprisingly obvious errors, and
	(2) that there is still room for syntactic parsers to improve these results.},
  url       = {http://aclweb.org/anthology/P17-1044}
}

@InProceedings{dhingra-EtAl:2017:Long1,
  author    = {Dhingra, Bhuwan  and  Li, Lihong  and  Li, Xiujun  and  Gao, Jianfeng  and  Chen, Yun-Nung  and  Ahmed, Faisal  and  Deng, Li},
  title     = {Towards End-to-End Reinforcement Learning of Dialogue Agents for Information Access},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {484--495},
  abstract  = {This paper proposes KB-InfoBot - a multi-turn dialogue agent which helps users
	search Knowledge Bases (KBs) without composing complicated queries. Such
	goal-oriented dialogue agents typically need to interact with an external
	database to access real-world knowledge. Previous systems achieved this by
	issuing a symbolic query to the KB to retrieve entries based on their
	attributes. However, such symbolic operations break the differentiability of
	the system and prevent end-to-end training of neural dialogue agents. In this
	paper, we address this limitation by replacing symbolic queries with an induced
	``soft'' posterior distribution over the KB that indicates which entities the
	user is interested in. Integrating the soft retrieval process with a
	reinforcement learner leads to higher task success rate and reward in both
	simulations and against real users. We also present a fully neural end-to-end
	agent, trained entirely from user feedback, and discuss its application towards
	personalized dialogue agents.},
  url       = {http://aclweb.org/anthology/P17-1045}
}

@InProceedings{wu-EtAl:2017:Long1,
  author    = {Wu, Yu  and  Wu, Wei  and  Xing, Chen  and  Zhou, Ming  and  Li, Zhoujun},
  title     = {Sequential Matching Network: A New Architecture for Multi-turn Response Selection in Retrieval-Based Chatbots},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {496--505},
  abstract  = {We study response selection for multi-turn conversation in retrieval based
	chatbots. Existing work either concatenates utterances in context or matches a
	response with a highly abstract context vector finally, which may lose
	relationships among the utterances or important information in the context. We
	propose a sequential matching network (SMN) to address both problems. SMN first
	matches a response with each utterance in the context on multiple levels of
	granularity, and distills important matching information from each pair as a
	vector with convolution and pooling operations. The vectors are then
	accumulated in a chronological order through a recurrent neural network (RNN)
	which models relationships among the utterances. The final matching score is
	calculated with the hidden states of the RNN. Empirical study on two public
	data sets shows that SMN can significantly outperform state-of-the-art methods
	for response selection in multi-turn conversation.},
  url       = {http://aclweb.org/anthology/P17-1046}
}

@InProceedings{harwath-glass:2017:Long,
  author    = {Harwath, David  and  Glass, James},
  title     = {Learning Word-Like Units from Joint Audio-Visual Analysis},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {506--517},
  abstract  = {Given a collection of images and spoken audio captions, we present a method for
	discovering word-like acoustic units in the continuous speech signal and
	grounding them to semantically relevant image regions. For example, our model
	is able to detect spoken instances of the word `lighthouse' within an utterance
	and associate them with image regions containing lighthouses. We do not use any
	form of conventional automatic speech recognition, nor do we use any text
	transcriptions or conventional linguistic annotations. Our model effectively
	implements a form of spoken language acquisition, in which the computer learns
	not only to recognize word categories by sound, but also to enrich the words it
	learns with semantics by grounding them in images.},
  url       = {http://aclweb.org/anthology/P17-1047}
}

@InProceedings{hori-watanabe-hershey:2017:Long,
  author    = {Hori, Takaaki  and  Watanabe, Shinji  and  Hershey, John},
  title     = {Joint {CTC}/attention decoding for end-to-end speech recognition},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {518--529},
  abstract  = {End-to-end automatic speech recognition (ASR) has become a popular alternative
	to conventional DNN/HMM systems because it avoids the need for linguistic
	resources such as pronunciation dictionary, tokenization, and
	context-dependency trees, leading to a greatly simplified model-building
	process. There are two major types of end-to-end architectures for ASR: 
	attention-based methods use an attention mechanism to perform alignment between
	acoustic frames and recognized symbols, and connectionist temporal
	classification (CTC), uses Markov assumptions to efficiently solve sequential
	problems by dynamic programming. This paper proposes joint decoding algorithm
	for end-to-end ASR with a hybrid CTC/attention architecture, which effectively
	utilizes both advantages in decoding. We have applied the proposed method to
	two ASR benchmarks (spontaneous Japanese and Mandarin Chinese), and showing the
	comparable performance to conventional state-of-the-art DNN/HMM ASR systems
	without linguistic resources.},
  url       = {http://aclweb.org/anthology/P17-1048}
}

@InProceedings{rabinovich-ordan-wintner:2017:Long,
  author    = {Rabinovich, Ella  and  Ordan, Noam  and  Wintner, Shuly},
  title     = {Found in Translation: Reconstructing Phylogenetic Language Trees from Translations},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {530--540},
  abstract  = {Translation has played an important role in trade, law, commerce, politics, and
	literature for thousands of years. Translators have always tried to be
	invisible; ideal translations should look as if they were written originally in
	the target language. We show that traces of the source language remain in the
	translation product to the extent that it is possible to uncover the history of
	the source language by looking only at the translation. Specifically, we
	automatically reconstruct phylogenetic language trees from monolingual texts
	(translated from several source languages). The signal of the source language
	is so powerful that it is retained even after two phases of translation. This
	strongly indicates that source language interference is the most dominant
	characteristic of translated texts, overshadowing the more subtle signals of
	universal properties of translation.},
  url       = {http://aclweb.org/anthology/P17-1049}
}

@InProceedings{berzak-EtAl:2017:Long,
  author    = {Berzak, Yevgeni  and  Nakamura, Chie  and  Flynn, Suzanne  and  Katz, Boris},
  title     = {Predicting Native Language from Gaze},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {541--551},
  abstract  = {A fundamental question in language learning concerns the role of a speaker's
	first language in second language acquisition. We present a novel methodology
	for studying this question: analysis of eye-movement patterns in second
	language reading of free-form text. Using this methodology, we demonstrate for
	the first time that the native language of English learners can be predicted
	from their gaze fixations when reading English. We provide analysis of
	classifier uncertainty and learned features, which indicates that differences
	in English reading are likely to be rooted in linguistic divergences across
	native languages. The presented framework complements production studies and
	offers new ground for advancing research on multilingualism.},
  url       = {http://aclweb.org/anthology/P17-1050}
}

@InProceedings{sakakini-bhat-viswanath:2017:Long,
  author    = {Sakakini, Tarek  and  Bhat, Suma  and  Viswanath, Pramod},
  title     = {{MORSE}: {Semantic-ally} {Drive-n} {MORpheme} {SEgment-er}},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {552--561},
  abstract  = {We present in this paper a novel framework for morpheme segmentation which uses
	the morpho-syntactic regularities preserved by word representations, in
	addition to orthographic features, to segment words into morphemes. This
	framework is the first to consider vocabulary-wide syntactico-semantic 
	information for this task. We also analyze the deficiencies  of  available
	benchmarking datasets and introduce our own dataset that was created on the
	basis of compositionality.  We validate our algorithm across datasets and
	present state-of-the-art results.},
  url       = {http://aclweb.org/anthology/P17-1051}
}

@InProceedings{johnson-zhang:2017:Long,
  author    = {Johnson, Rie  and  Zhang, Tong},
  title     = {Deep Pyramid Convolutional Neural Networks for Text Categorization},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {562--570},
  abstract  = {This paper proposes a low-complexity word-level deep convolutional neural
	network (CNN) architecture for text categorization that can efficiently
	represent long-range associations in text.  In the literature, several deep and
	complex neural networks have been proposed for this task, assuming availability
	of relatively large amounts of training data.  However, the associated
	computational complexity increases as the networks go deeper, which poses
	serious challenges in practical applications.  Moreover, it was shown recently
	that shallow word-level CNNs are more accurate and much faster than the
	state-of-the-art very deep nets such as character-level CNNs even in the
	setting of large training data.  Motivated by these findings, we carefully
	studied deepening of word-level CNNs to capture global representations of text,
	and found a simple network architecture with which the best accuracy can be
	obtained by increasing the network depth without increasing computational cost
	by much.  We call it deep pyramid CNN.  The proposed model with 15 weight
	layers outperforms the previous best models on six benchmark datasets for
	sentiment classification and topic categorization.},
  url       = {http://aclweb.org/anthology/P17-1052}
}

@InProceedings{yu-EtAl:2017:Long,
  author    = {Yu, Mo  and  Yin, Wenpeng  and  Hasan, Kazi Saidul  and  dos Santos, Cicero  and  Xiang, Bing  and  Zhou, Bowen},
  title     = {Improved Neural Relation Detection for Knowledge Base Question Answering},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {571--581},
  abstract  = {Relation detection is a core component of many NLP applications including
	Knowledge Base Question Answering (KBQA). In this paper, we propose a
	hierarchical recurrent neural network enhanced by residual learning which
	detects KB relations given an input question. Our method uses deep residual
	bidirectional LSTMs to compare questions and relation names via different
	levels of abstraction. Additionally, we propose a simple KBQA system that
	integrates entity linking and our proposed relation detector to make the two
	components enhance each other. Our experimental results show that our approach
	not only achieves outstanding relation detection performance, but more
	importantly, it helps our KBQA system achieve state-of-the-art accuracy for
	both single-relation (SimpleQuestions) and multi-relation (WebQSP) QA
	benchmarks.},
  url       = {http://aclweb.org/anthology/P17-1053}
}

@InProceedings{meng-EtAl:2017:Long,
  author    = {Meng, Rui  and  Zhao, Sanqiang  and  Han, Shuguang  and  He, Daqing  and  Brusilovsky, Peter  and  Chi, Yu},
  title     = {Deep Keyphrase Generation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {582--592},
  abstract  = {Keyphrase provides highly-summative information that can be effectively used
	for understanding, organizing and retrieving text content. Though previous
	studies have provided many workable solutions for automated keyphrase
	extraction, they commonly divided the to-be-summarized content into multiple
	text chunks, then ranked and selected the most meaningful ones. These
	approaches could neither identify keyphrases that do not appear in the text,
	nor capture the real semantic meaning behind the text. We propose a generative
	model for keyphrase prediction with an encoder-decoder framework, which can
	effectively overcome the above drawbacks.  We name it as \textit{deep keyphrase
	generation} since it attempts to capture the deep semantic meaning of the
	content with a deep learning method. Empirical analysis on six datasets
	demonstrates that our proposed model not only achieves a significant
	performance boost on extracting keyphrases that appear in the source text, but
	also can generate absent keyphrases based on the semantic meaning of the text.
	Code and dataset are available at https://github.com/memray/seq2seq-keyphrase.},
  url       = {http://aclweb.org/anthology/P17-1054}
}

@InProceedings{cui-EtAl:2017:Long,
  author    = {Cui, Yiming  and  Chen, Zhipeng  and  Wei, Si  and  Wang, Shijin  and  Liu, Ting  and  Hu, Guoping},
  title     = {Attention-over-Attention Neural Networks for Reading Comprehension},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {593--602},
  abstract  = {Cloze-style reading comprehension is a representative problem in mining
	relationship between document and query.
	In this paper, we present a simple but novel model called
	attention-over-attention reader for better solving cloze-style reading
	comprehension task.
	The proposed model aims to place another attention mechanism over the
	document-level attention and induces ``attended attention'' for final answer
	predictions.
	One advantage of our model is that it is simpler than related works while
	giving excellent performance.
	In addition to the primary model, we also propose an N-best re-ranking strategy
	to double check the validity of the candidates and further improve the
	performance.
	Experimental results show that the proposed methods significantly outperform
	various state-of-the-art systems by a large margin in public datasets, such as
	CNN and Children's Book Test.},
  url       = {http://aclweb.org/anthology/P17-1055}
}

@InProceedings{doyle-EtAl:2017:Long,
  author    = {Doyle, Gabriel  and  Goldberg, Amir  and  Srivastava, Sameer  and  Frank, Michael},
  title     = {Alignment at Work: Using Language to Distinguish the Internalization and Self-Regulation Components of Cultural Fit in Organizations},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {603--612},
  abstract  = {Cultural fit is widely believed to affect the success of individuals and the
	groups to which they belong. Yet it remains an elusive, poorly measured
	construct. Recent research draws on computational linguistics to measure
	cultural fit but overlooks asymmetries in cultural adaptation. By contrast, we
	develop a directed, dynamic measure of cultural fit based on linguistic
	alignment, which estimates the influence of one person's word use on another's
	and distinguishes between two enculturation mechanisms: internalization and
	self-regulation. We use this measure to trace employees' enculturation
	trajectories over a large, multi-year corpus of corporate emails and find that
	patterns of alignment in the first six months of employment are predictive of
	individuals' downstream outcomes, especially involuntary exit. Further
	predictive analyses suggest referential alignment plays an overlooked role in
	linguistic alignment.},
  url       = {http://aclweb.org/anthology/P17-1056}
}

@InProceedings{chrupala-gelderloos-alishahi:2017:Long,
  author    = {Chrupa{\l}a, Grzegorz  and  Gelderloos, Lieke  and  Alishahi, Afra},
  title     = {Representations of language in a model of visually grounded speech signal},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {613--622},
  abstract  = {We present a visually grounded model of speech perception which projects spoken
	utterances and images to a joint semantic space. We use a multi-layer recurrent
	highway network to model the temporal nature of spoken speech, and show that it
	learns to extract both form and meaning-based linguistic knowledge from the
	input signal. We carry out an in-depth analysis of the representations used by
	different components of the trained model and show that encoding of semantic
	aspects tends to become richer as we go up the hierarchy of layers, whereas
	encoding of form-related aspects of the language input tends to initially
	increase and then plateau or decrease.},
  url       = {http://aclweb.org/anthology/P17-1057}
}

@InProceedings{xu-reitter:2017:Long,
  author    = {Xu, Yang  and  Reitter, David},
  title     = {Spectral Analysis of Information Density in Dialogue Predicts Collaborative Task Performance},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {623--633},
  abstract  = {We propose a perspective on dialogue that focuses on relative information
	contributions of conversation partners as a key to successful communication. We
	predict the success of collaborative task in English and Danish corpora of
	task-oriented dialogue. Two features are extracted from the frequency domain
	representations of  the lexical entropy series of each interlocutor, power
	spectrum overlap (PSO) and relative phase (RP). We find that PSO is a negative
	predictor of task success, while RP is a positive one. An SVM with these
	features significantly improved on previous task success prediction models. Our
	findings suggest that the strategic distribution of information density between
	interlocutors  is relevant to task success.},
  url       = {http://aclweb.org/anthology/P17-1058}
}

@InProceedings{ghosh-EtAl:2017:Long,
  author    = {Ghosh, Sayan  and  Chollet, Mathieu  and  Laksana, Eugene  and  Morency, Louis-Philippe  and  Scherer, Stefan},
  title     = {{Affect-LM}: A Neural Language Model for Customizable Affective Text Generation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {634--642},
  abstract  = {Human verbal communication includes affective messages which are conveyed
	through use of emotionally colored words. There has been a lot of research
	effort in this direction but the problem of integrating state-of-the-art neural
	language models with affective information remains an area ripe for
	exploration. In this paper, we propose an extension to an LSTM (Long Short-Term
	Memory) language model for generation of conversational text, conditioned on
	affect categories. Our proposed model, Affect-LM enables us to customize the
	degree of emotional content in generated sentences through an additional design
	parameter. Perception studies conducted using Amazon Mechanical Turk show that
	Affect-LM can generate naturally looking emotional sentences without
	sacrificing grammatical correctness. Affect-LM also learns
	affect-discriminative word representations, and perplexity experiments show
	that additional affective information in conversational text can improve
	language model prediction.},
  url       = {http://aclweb.org/anthology/P17-1059}
}

@InProceedings{kim-stratos-kim:2017:Long1,
  author    = {Kim, Young-Bum  and  Stratos, Karl  and  Kim, Dongchan},
  title     = {Domain Attention with an Ensemble of Experts},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {643--653},
  abstract  = {An important problem in domain adaptation is to quickly generalize to a new
	domain with limited supervision given K existing domains. One approach is to
	retrain a global model across all K + 1 domains using standard techniques, for
	instance Daum{\'e} III (2009). However, it is desirable to adapt without having
	to re-estimate a global model from scratch each time a new domain with
	potentially new intents and slots is added. We describe a solution based on
	attending an ensemble of domain experts. We assume K domain specific intent and
	slot models trained on respective domains. When given domain K + 1, our model
	uses a weighted combination of the K domain experts' feedback along with its
	own opinion to make predictions on the new domain. In experiments,
	the model significantly outperforms baselines that do not use domain adaptation
	and also performs better than the full retraining approach.},
  url       = {http://aclweb.org/anthology/P17-1060}
}

@InProceedings{zhao-zhao-eskenazi:2017:Long,
  author    = {Zhao, Tiancheng  and  Zhao, Ran  and  Eskenazi, Maxine},
  title     = {Learning Discourse-level Diversity for Neural Dialog Models using Conditional Variational Autoencoders},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {654--664},
  abstract  = {While recent neural encoder-decoder models have shown great promise in modeling
	open-domain conversations, they often generate dull and generic responses.
	Unlike past work that has focused on diversifying the output of the decoder
	from word-level to alleviate this problem, we present a novel framework based
	on conditional variational autoencoders that capture the discourse-level
	diversity in the encoder. Our model uses latent variables to learn a
	distribution over potential conversational intents and generates diverse
	responses using only greedy decoders. We have further developed a novel variant
	that is integrated with linguistic prior knowledge for better performance.
	Finally, the training procedure is improved through introducing a bag-of-word
	loss. Our proposed models have been validated to generate significantly more
	diverse responses than baseline approaches and exhibit competence of
	discourse-level decision-making.},
  url       = {http://aclweb.org/anthology/P17-1061}
}

@InProceedings{williams-asadi-zweig:2017:Long,
  author    = {Williams, Jason D.  and  Asadi, Kavosh  and  Zweig, Geoffrey},
  title     = {Hybrid Code Networks: practical and efficient end-to-end dialog control with supervised and reinforcement learning},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {665--677},
  abstract  = {End-to-end learning of recurrent neural networks (RNNs) is an attractive
	solution for dialog systems; however, current techniques are data-intensive and
	require thousands of dialogs to learn simple behaviors.  We introduce Hybrid
	Code Networks (HCNs), which combine an RNN with domain-specific knowledge
	encoded as software and system action templates. Compared to existing
	end-to-end approaches, HCNs considerably reduce the amount of training data
	required, while retaining the key benefit of inferring a latent representation
	of dialog state. In addition, HCNs can be optimized with supervised learning,
	reinforcement learning, or a mixture of both. HCNs attain state-of-the-art
	performance on the bAbI dialog dataset (Bordes and Weston, 2016), and
	outperform two commercially deployed customer-facing dialog systems at our
	company.},
  url       = {http://aclweb.org/anthology/P17-1062}
}

@InProceedings{villalba-teichmann-koller:2017:Long,
  author    = {Villalba, Martin  and  Teichmann, Christoph  and  Koller, Alexander},
  title     = {Generating Contrastive Referring Expressions},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {678--687},
  abstract  = {The referring expressions (REs) produced by a natural language generation (NLG)
	system can be misunderstood by the hearer, even when they are semantically
	correct. In an interactive setting, the NLG system can try to recognize such
	misunderstandings and correct them. We present an algorithm for generating
	corrective REs that use contrastive focus (``no, the BLUE button'') to
	emphasize the information the hearer most likely misunderstood. We show
	empirically that these contrastive REs are preferred over REs without contrast
	marking.},
  url       = {http://aclweb.org/anthology/P17-1063}
}

@InProceedings{li-EtAl:2017:Long,
  author    = {Li, Junhui  and  Xiong, Deyi  and  Tu, Zhaopeng  and  Zhu, Muhua  and  Zhang, Min  and  Zhou, Guodong},
  title     = {Modeling Source Syntax for Neural Machine Translation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {688--697},
  abstract  = {Even though a linguistics-free sequence to sequence model in neural machine
	translation (NMT) has certain capability of implicitly learning syntactic
	information of source sentences, this paper shows that source syntax can be
	explicitly incorporated into NMT effectively to provide further improvements.
	Specifically, we linearize parse trees of source sentences to obtain structural
	label sequences. On the basis, we propose three different sorts of encoders to
	incorporate source syntax into NMT: 1) Parallel RNN encoder that learns word
	and label annotation vectors parallelly; 2) Hierarchical RNN encoder that
	learns word and label annotation vectors in a two-level hierarchy; and 3) Mixed
	RNN encoder that stitchingly learns word and label annotation vectors over
	sequences where words and labels are mixed. Experimentation on
	Chinese-to-English translation demonstrates that all the three proposed
	syntactic encoders are able to improve translation accuracy. It is interesting
	to note that the simplest RNN encoder, i.e., Mixed RNN encoder yields the best
	performance with a significant improvement of 1.4 BLEU points. Moreover, an
	in-depth analysis from several perspectives is provided to reveal how source
	syntax benefits NMT.},
  url       = {http://aclweb.org/anthology/P17-1064}
}

@InProceedings{wu-EtAl:2017:Long2,
  author    = {Wu, Shuangzhi  and  Zhang, Dongdong  and  Yang, Nan  and  Li, Mu  and  Zhou, Ming},
  title     = {Sequence-to-Dependency Neural Machine Translation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {698--707},
  abstract  = {Nowadays a typical Neural Machine Translation (NMT) model generates
	translations from left to right as a linear sequence, during which latent
	syntactic structures of the target sentences are not explicitly concerned.
	Inspired by the success of using syntactic knowledge of target language for
	improving statistical machine translation,
	in this paper we propose a novel Sequence-to-Dependency Neural Machine
	Translation (SD-NMT) method, in which the target word sequence and its
	corresponding dependency structure are jointly constructed and modeled, and
	this structure is used as context to facilitate word generations. Experimental
	results show that the proposed method significantly outperforms
	state-of-the-art baselines on Chinese-English and Japanese-English translation
	tasks.},
  url       = {http://aclweb.org/anthology/P17-1065}
}

@InProceedings{ma-gao-wong:2017:Long,
  author    = {Ma, Jing  and  Gao, Wei  and  Wong, Kam-Fai},
  title     = {Detect Rumors in Microblog Posts Using Propagation Structure via Kernel Learning},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {708--717},
  abstract  = {How fake news goes viral via social media? How does its propagation pattern
	differ from real stories? In this paper, we attempt to address the problem of
	identifying rumors, i.e., fake information, out of microblog posts based on
	their propagation structure. We firstly model microblog posts diffusion with
	propagation trees, which provide valuable clues on how an original message is
	transmitted and developed over time. We then propose a kernel-based method
	called Propagation Tree Kernel, which captures high-order patterns
	differentiating different types of rumors by evaluating the similarities
	between their propagation tree structures. Experimental results on two
	real-world datasets demonstrate that the proposed kernel-based approach can
	detect rumors more quickly and accurately than state-of-the-art rumor detection
	models.},
  url       = {http://aclweb.org/anthology/P17-1066}
}

@InProceedings{abdulmageed-ungar:2017:Long,
  author    = {Abdul-Mageed, Muhammad  and  Ungar, Lyle},
  title     = {EmoNet: Fine-Grained Emotion Detection with Gated Recurrent Neural Networks},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {718--728},
  abstract  = {Accurate detection of emotion from natural language has applications ranging
	from building emotional chatbots to better understanding individuals and their
	lives. However, progress on emotion detection has been hampered by the absence
	of large labeled datasets.  In this work, we build a very large dataset for
	fine-grained emotions and develop deep learning models on it. We achieve a new
	state-of-the-art on 24 fine-grained types of emotions (with an average accuracy
	of 87.58%). We also extend the task beyond emotion types to model Robert
	Plutchik's 8 primary emotion dimensions, acquiring a superior accuracy of
	95.68%.},
  url       = {http://aclweb.org/anthology/P17-1067}
}

@InProceedings{preoiucpietro-EtAl:2017:Long,
  author    = {Preoţiuc-Pietro, Daniel  and  Liu, Ye  and  Hopkins, Daniel  and  Ungar, Lyle},
  title     = {Beyond Binary Labels: Political Ideology Prediction of Twitter Users},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {729--740},
  abstract  = {Automatic political orientation prediction from social media posts has to date
	proven successful only in distinguishing between publicly declared liberals and
	conservatives in the US. This study examines users’ political ideology using
	a seven-point scale which enables us to identify politically moderate and
	neutral users -- groups which are of particular interest to political
	scientists and pollsters. Using a novel data set with political ideology labels
	self-reported through surveys, our goal is two-fold: a) to characterize the
	groups of politically engaged users through language use on Twitter; b) to
	build a fine-grained model that predicts political ideology of unseen users.
	Our results identify differences in both political leaning and engagement and
	the extent to which each group tweets using political keywords. Finally, we
	demonstrate how to improve ideology prediction accuracy by exploiting the
	relationships between the user groups.},
  url       = {http://aclweb.org/anthology/P17-1068}
}

@InProceedings{johnson-jin-goldwasser:2017:Long,
  author    = {Johnson, Kristen  and  Jin, Di  and  Goldwasser, Dan},
  title     = {Leveraging Behavioral and Social Information for Weakly Supervised Collective Classification of Political Discourse on Twitter},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {741--752},
  abstract  = {Framing is a political strategy in which politicians carefully word their
	statements in order to control public perception of issues. Previous works
	exploring political framing typically analyze frame usage in longer texts, such
	as congressional speeches. We present a collection of weakly supervised models
	which harness collective classification to predict the frames used in political
	discourse on the microblogging platform, Twitter. Our global probabilistic
	models show that by combining both lexical features of tweets and network-based
	behavioral features of Twitter, we are able to increase the average,
	unsupervised F1 score by 21.52 points over a lexical baseline alone.},
  url       = {http://aclweb.org/anthology/P17-1069}
}

@InProceedings{ji-EtAl:2017:Long,
  author    = {Ji, Jianshu  and  Wang, Qinlong  and  Toutanova, Kristina  and  Gong, Yongen  and  Truong, Steven  and  Gao, Jianfeng},
  title     = {A Nested Attention Neural Hybrid Model for Grammatical Error Correction},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {753--762},
  abstract  = {Grammatical error correction (GEC) systems strive to correct both global errors
	in word order and usage, and local errors in spelling and inflection. Further
	developing upon recent work on neural machine translation, we propose a new
	hybrid neural model with nested attention layers for GEC. Experiments show that
	the new model can effectively correct errors of both types by incorporating
	word and character-level information, and that the model significantly
	outperforms previous  neural models for GEC as measured on the standard
	CoNLL-14 benchmark dataset. Further analysis also shows that the superiority of
	the proposed model can be largely attributed to the use of the nested attention
	mechanism, which has proven particularly effective in correcting local errors
	that involve small edits in orthography.},
  url       = {http://aclweb.org/anthology/P17-1070}
}

@InProceedings{mrabet-kilicoglu-demnerfushman:2017:Long,
  author    = {Mrabet, Yassine  and  Kilicoglu, Halil  and  Demner-Fushman, Dina},
  title     = {TextFlow: A Text Similarity Measure based on Continuous Sequences},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {763--772},
  abstract  = {Text similarity measures are used in multiple tasks such as plagiarism
	detection, information ranking and recognition of paraphrases and textual
	entailment. While recent advances in deep learning highlighted the relevance of
	sequential models in natural language generation, existing similarity measures
	do not fully exploit the sequential nature of language. Examples of such
	similarity measures include n-grams and skip-grams overlap which rely on
	distinct slices of the input texts. In this paper we present a novel text
	similarity measure inspired from a common representation in DNA sequence
	alignment algorithms. The new measure, called TextFlow, represents input text
	pairs as continuous curves and uses both the actual position of the words and
	sequence matching to compute the similarity value. Our experiments on 8
	different datasets show very encouraging results in paraphrase detection,
	textual entailment recognition and ranking relevance.},
  url       = {http://aclweb.org/anthology/P17-1071}
}

@InProceedings{tan-card-smith:2017:Long,
  author    = {Tan, Chenhao  and  Card, Dallas  and  Smith, Noah A.},
  title     = {Friendships, Rivalries, and Trysts: Characterizing Relations between Ideas in Texts},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {773--783},
  abstract  = {Understanding how ideas relate to each other is a fundamental question in many
	domains, ranging from intellectual history to public communication. Because
	ideas are naturally embedded in texts, we propose the first framework to
	systematically characterize the relations between ideas based on their
	occurrence in a corpus of documents, independent of how these ideas are
	represented. Combining two statistics—cooccurrence within documents and
	prevalence correlation over time—our approach reveals a number of different
	ways in which ideas can cooperate and compete. For instance, two ideas can
	closely track each other’s prevalence over time, and yet rarely cooccur,
	almost like a “cold war” scenario. We observe that pairwise cooccurrence
	and prevalence correlation exhibit different distributions. We further
	demonstrate that our approach is able to uncover intriguing relations between
	ideas through in-depth case studies on news articles and research papers.},
  url       = {http://aclweb.org/anthology/P17-1072}
}

@InProceedings{wroblewska-krasnowskakieras:2017:Long,
  author    = {Wr\'{o}blewska, Alina  and  Krasnowska-Kiera\'{s}, Katarzyna},
  title     = {Polish evaluation dataset for compositional distributional semantics models},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {784--792},
  abstract  = {The paper presents a procedure of building an evaluation dataset for the
	validation of compositional distributional semantics models estimated for
	languages other than English. The procedure generally builds on steps designed
	to assemble the SICK corpus, which contains pairs of English sentences
	annotated for semantic relatedness and entailment, because we aim at building a
	comparable dataset. However, the implementation of particular building steps
	significantly differs from the original SICK design assumptions, which is
	caused by both lack of necessary extraneous resources for an investigated
	language and the need for language-specific transformation rules. The designed
	procedure is verified on Polish, a fusional language with a relatively free
	word order, and contributes to building a Polish evaluation dataset. The
	resource consists of 10K sentence pairs which are human-annotated for semantic
	relatedness and entailment. The dataset may be used for the evaluation of
	compositional distributional semantics models of Polish.},
  url       = {http://aclweb.org/anthology/P17-1073}
}

@InProceedings{bryant-felice-briscoe:2017:Long,
  author    = {Bryant, Christopher  and  Felice, Mariano  and  Briscoe, Ted},
  title     = {Automatic Annotation and Evaluation of Error Types for Grammatical Error Correction},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {793--805},
  abstract  = {Until now, error type performance for Grammatical Error Correction (GEC)
	systems could only be measured in terms of recall because system output is not
	annotated. To overcome this problem, we introduce ERRANT, a grammatical ERRor
	ANnotation Toolkit designed to automatically extract edits from parallel
	original and corrected sentences and classify them according to a new,
	dataset-agnostic, rule-based framework. This not only facilitates error type
	evaluation at different levels of granularity, but can also be used to reduce
	annotator workload and standardise existing GEC datasets. Human experts rated
	the automatic edits as ``Good'' or ``Acceptable'' in at least 95% of cases, so
	we applied ERRANT to the system output of the CoNLL-2014 shared task to carry
	out a detailed error type analysis for the first time.},
  url       = {http://aclweb.org/anthology/P17-1074}
}

@InProceedings{sugawara-EtAl:2017:Long,
  author    = {Sugawara, Saku  and  Kido, Yusuke  and  Yokono, Hikaru  and  Aizawa, Akiko},
  title     = {Evaluation Metrics for Machine Reading Comprehension: Prerequisite Skills and Readability},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {806--817},
  abstract  = {Knowing the quality of reading comprehension (RC) datasets is important for the
	development of natural-language understanding systems.
	  In this study, two classes of metrics were adopted for evaluating RC
	datasets: prerequisite skills and readability. We applied these classes to six
	existing datasets, including MCTest and SQuAD, and highlighted the
	characteristics of the datasets according to each metric and the correlation
	between the two classes.
	  Our dataset analysis suggests that the readability of RC datasets does not
	directly affect the question difficulty and that it is possible to create an RC
	dataset that is easy to read but difficult to answer.},
  url       = {http://aclweb.org/anthology/P17-1075}
}

@InProceedings{stern-andreas-klein:2017:Long,
  author    = {Stern, Mitchell  and  Andreas, Jacob  and  Klein, Dan},
  title     = {A Minimal Span-Based Neural Constituency Parser},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {818--827},
  abstract  = {In this work, we present a minimal neural model for constituency parsing based
	on independent scoring of labels and spans. We show that this model is not only
	compatible with classical dynamic programming techniques, but also admits a
	novel greedy top-down inference algorithm based on recursive partitioning of
	the input. We demonstrate empirically that both prediction schemes are
	competitive with recent work, and when combined with basic extensions to the
	scoring model are capable of achieving state-of-the-art single-model
	performance on the Penn Treebank (91.79 F1) and strong performance on the
	French Treebank (82.23 F1).},
  url       = {http://aclweb.org/anthology/P17-1076}
}

@InProceedings{sun-cao-wan:2017:Long,
  author    = {Sun, Weiwei  and  Cao, Junjie  and  Wan, Xiaojun},
  title     = {Semantic Dependency Parsing via Book Embedding},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {828--838},
  abstract  = {We model a dependency graph as a book, a particular kind of topological space,
	for
	semantic dependency parsing. The spine of the book is made up of a sequence of
	words, and each page contains a subset of noncrossing arcs. To build a semantic
	graph for a given sentence, we design new Maximum Subgraph algorithms to
	generate noncrossing graphs on each page, and a Lagrangian Relaxation-based
	algorithm to combine pages into a book. Experiments demonstrate the
	effectiveness of the book embedding framework across a wide range of conditions.
	Our parser obtains
	comparable results with a state-of-the-art
	transition-based parser.},
  url       = {http://aclweb.org/anthology/P17-1077}
}

@InProceedings{yang-zhang-dong:2017:Long,
  author    = {Yang, Jie  and  Zhang, Yue  and  Dong, Fei},
  title     = {Neural Word Segmentation with Rich Pretraining},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {839--849},
  abstract  = {Neural word segmentation research has benefited from large-scale raw texts by
	leveraging them for pretraining character and word embeddings. On the other
	hand, statistical segmentation research has exploited richer sources of
	external information, such as punctuation, automatic segmentation and POS. We
	investigate the effectiveness of a range of external training sources for
	neural word segmentation by building a modular segmentation model, pretraining
	the most important submodule using rich external sources. Results show that
	such pretraining significantly improves the model, leading to accuracies
	competitive to the best methods on six benchmarks.},
  url       = {http://aclweb.org/anthology/P17-1078}
}

@InProceedings{oda-EtAl:2017:Long,
  author    = {Oda, Yusuke  and  Arthur, Philip  and  Neubig, Graham  and  Yoshino, Koichiro  and  Nakamura, Satoshi},
  title     = {Neural Machine Translation via Binary Code Prediction},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {850--860},
  abstract  = {In this paper, we propose a new method for calculating the output layer in
	neural machine translation systems. The method is based on predicting a
	binary code for each word and can reduce computation time/memory requirements
	of the output layer to be logarithmic in vocabulary size in the best case.
	In addition, we also introduce two advanced approaches to improve the
	robustness of the proposed model: using error-correcting codes and combining
	softmax and binary codes. Experiments on two English-Japanese bidirectional
	translation tasks show proposed models achieve BLEU scores that approach the
	softmax, while reducing memory usage to the order of less than 1/10 and
	improving decoding speed on CPUs by x5 to x10.},
  url       = {http://aclweb.org/anthology/P17-1079}
}

@InProceedings{belinkov-EtAl:2017:Long,
  author    = {Belinkov, Yonatan  and  Durrani, Nadir  and  Dalvi, Fahim  and  Sajjad, Hassan  and  Glass, James},
  title     = {What do Neural Machine Translation Models Learn about Morphology?},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {861--872},
  abstract  = {Neural machine translation (MT) models obtain state-of-the-art performance
	while maintaining a simple, end-to-end architecture. However, little is known
	about what these models learn about source and target languages during the
	training process. In this work, we analyze the representations learned by
	neural MT models at various levels of granularity and empirically evaluate the
	quality of the representations for learning morphology through extrinsic
	part-of-speech and morphological tagging tasks. We conduct a thorough
	investigation along several parameters: word-based vs. character-based
	representations, depth of the encoding layer, the identity of the target
	language, and encoder vs. decoder representations. Our data-driven,
	quantitative evaluation sheds light on important aspects in the neural MT
	system and its ability to capture word structure.},
  url       = {http://aclweb.org/anthology/P17-1080}
}

@InProceedings{poria-EtAl:2017:Long,
  author    = {Poria, Soujanya  and  Cambria, Erik  and  Hazarika, Devamanyu  and  Majumder, Navonil  and  Zadeh, Amir  and  Morency, Louis-Philippe},
  title     = {Context-Dependent Sentiment Analysis in User-Generated Videos},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {873--883},
  abstract  = {Multimodal sentiment analysis is a developing
	area of research, which involves
	the identification of sentiments in videos.
	Current research considers utterances as
	independent entities, i.e., ignores the interdependencies
	and relations among the utterances
	of a video. In this paper, we propose
	a LSTM-based model that enables
	utterances to capture contextual information
	from their surroundings in the same
	video, thus aiding the classification process.
	Our method shows 5-10% performance
	improvement over the state of the
	art and high robustness to generalizability.},
  url       = {http://aclweb.org/anthology/P17-1081}
}

@InProceedings{pavalanathan-EtAl:2017:Long,
  author    = {Pavalanathan, Umashanthi  and  Fitzpatrick, Jim  and  Kiesling, Scott  and  Eisenstein, Jacob},
  title     = {A Multidimensional Lexicon for Interpersonal Stancetaking},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {884--895},
  abstract  = {The sociolinguistic construct of stancetaking describes the activities through
	which discourse participants create and signal relationships to their
	interlocutors, to the topic of discussion, and to the talk itself. Stancetaking
	underlies a wide range of interactional phenomena, relating to formality,
	politeness, affect, and subjectivity. We present a computational approach to
	stancetaking, in which we build a theoretically-motivated lexicon of stance
	markers, and then use multidimensional analysis to identify a set of underlying
	stance dimensions. We validate these dimensions intrinsically and
	extrinsically, showing that they are internally coherent, match pre-registered
	hypotheses, and correlate with social phenomena.},
  url       = {http://aclweb.org/anthology/P17-1082}
}

@InProceedings{lund-EtAl:2017:Long,
  author    = {Lund, Jeffrey  and  Cook, Connor  and  Seppi, Kevin  and  Boyd-Graber, Jordan},
  title     = {Tandem Anchoring: a Multiword Anchor Approach for Interactive Topic Modeling},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {896--905},
  abstract  = {Interactive topic models are powerful tools
	for those seeking to understand large
	collections of text. However, existing
	sampling-based interactive topic modeling
	approaches scale poorly to large data sets.
	Anchor methods, which use a single word
	to uniquely identify a topic, offer the speed
	needed for interactive work but lack both
	a mechanism to inject prior knowledge
	and lack the intuitive semantics needed
	for user-facing applications. We propose
	combinations of words as anchors, going
	beyond existing single word anchor
	algorithms—an approach we call “Tandem
	Anchors”. We begin with a synthetic
	investigation of this approach then apply
	the approach to interactive topic modeling
	in a user study and compare it to
	interactive and non-interactive approaches.
	Tandem anchors are faster and more intuitive
	than existing interactive approaches.},
  url       = {http://aclweb.org/anthology/P17-1083}
}

@InProceedings{bakhshandeh-allen:2017:Long,
  author    = {Bakhshandeh, Omid  and  Allen, James},
  title     = {Apples to Apples: Learning Semantics of Common Entities Through a Novel Comprehension Task},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {906--916},
  abstract  = {Understanding common entities and their attributes is a primary requirement for
	any system that comprehends natural language. In order to enable learning about
	common entities, we introduce a novel machine comprehension task, GuessTwo:
	given a short paragraph comparing different aspects of two real-world
	semantically-similar entities, a system should guess what those entities are.
	Accomplishing this task requires deep language understanding which enables
	inference, connecting each comparison paragraph to different levels of
	knowledge about world entities and their attributes. So far we have
	crowdsourced a dataset of more than 14K comparison paragraphs comparing
	entities from a variety of categories such as fruits and animals. We have
	designed two schemes for evaluation: open-ended, and binary-choice prediction.
	For benchmarking further progress in the task, we have collected a set of
	paragraphs as the test set on which human can accomplish the task with an
	accuracy of 94.2% on open-ended prediction. We have implemented various models
	for tackling the task, ranging from semantic-driven to neural models. The
	semantic-driven approach outperforms the neural models, however, the results
	indicate that the task is very challenging across the models.},
  url       = {http://aclweb.org/anthology/P17-1084}
}

@InProceedings{katiyar-cardie:2017:Long,
  author    = {Katiyar, Arzoo  and  Cardie, Claire},
  title     = {Going out on a limb: Joint Extraction of Entity Mentions and Relations without Dependency Trees},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {917--928},
  abstract  = {We present a novel attention-based recurrent neural network for joint
	extraction of entity mentions and relations. We show that attention along with
	long short term memory (LSTM) network can extract semantic relations between
	entity mentions without having access to dependency trees. 
	Experiments on Automatic Content Extraction (ACE) corpora show that our model
	significantly outperforms feature-based joint model by Li and Ji (2014). We
	also compare our model with an end-to-end tree-based LSTM model (SPTree) by
	Miwa and Bansal (2016) and show that our model performs within 1% on entity
	mentions and 2% on relations. Our fine-grained analysis also shows that our
	model performs significantly better on Agent-Artifact relations, while SPTree
	performs better on Physical and Part-Whole relations.},
  url       = {http://aclweb.org/anthology/P17-1085}
}

@InProceedings{wang-EtAl:2017:Long3,
  author    = {Wang, Sida I.  and  Ginn, Samuel  and  Liang, Percy  and  Manning, Christopher D.},
  title     = {Naturalizing a Programming Language via Interactive Learning},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {929--938},
  abstract  = {Our goal is to create a convenient natural language interface for performing
	well-specified but complex actions such as analyzing data, manipulating text,
	and querying databases. However, existing natural language interfaces for such
	tasks are quite primitive compared to the power one wields with a programming
	language. To bridge this gap, we start with a core programming language and
	allow users to ``naturalize'' the core language incrementally by defining
	alternative, more natural syntax and increasingly complex concepts in terms of
	compositions of simpler ones. In a voxel world, we show that a community of
	users can simultaneously teach a common system a diverse language and use it to
	build hundreds of complex voxel structures. Over the course of three days,
	these users went from using only the core language to using the naturalized
	language in 85.9% of the last 10K utterances.},
  url       = {http://aclweb.org/anthology/P17-1086}
}

@InProceedings{sedoc-EtAl:2017:Long,
  author    = {Sedoc, Joao  and  Gallier, Jean  and  Foster, Dean  and  Ungar, Lyle},
  title     = {Semantic Word Clusters Using Signed Spectral Clustering},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {939--949},
  abstract  = {Vector space representations of words capture many aspects of word similarity,
	but such methods tend to produce vector spaces in which antonyms (as well as
	synonyms) are close to each other. For spectral clustering using such word
	embeddings, words are points in a vector space where synonyms are linked with
	positive weights, while antonyms are linked with negative weights. We present a
	new signed spectral normalized graph cut algorithm, signed clustering,
	that overlays existing thesauri upon distributionally derived vector
	representations of words, so that antonym relationships between word pairs are
	represented by negative weights. Our signed clustering algorithm produces
	clusters of words that simultaneously capture distributional and synonym
	relations.  
	By using randomized spectral decomposition (Halko et al., 2011) and sparse
	matrices, our method is both fast and scalable. We validate our clusters using
	datasets containing human judgments of word pair similarities and show the
	benefit of using our word clusters for sentiment prediction.},
  url       = {http://aclweb.org/anthology/P17-1087}
}

@InProceedings{xie-EtAl:2017:Long,
  author    = {Xie, Qizhe  and  Ma, Xuezhe  and  Dai, Zihang  and  Hovy, Eduard},
  title     = {An Interpretable Knowledge Transfer Model for Knowledge Base Completion},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {950--962},
  abstract  = {Knowledge bases are important resources for a variety of natural language
	processing tasks but suffer from incompleteness. We propose a novel embedding
	model, ITransF, to perform knowledge base completion. Equipped with a
	sparse attention mechanism, ITransF discovers hidden concepts of relations and
	transfer statistical strength through the sharing of concepts. Moreover, the
	learned associations between relations and concepts, which are represented by
	sparse attention vectors, can be interpreted easily.
	We evaluate ITransF on two benchmark datasets---WN18 and FB15k for knowledge
	base completion and obtains improvements on both the mean rank and Hits$@$10
	metrics, over all baselines that do not use additional information.},
  url       = {http://aclweb.org/anthology/P17-1088}
}

@InProceedings{iyer-EtAl:2017:Long,
  author    = {Iyer, Srinivasan  and  Konstas, Ioannis  and  Cheung, Alvin  and  Krishnamurthy, Jayant  and  Zettlemoyer, Luke},
  title     = {Learning a Neural Semantic Parser from User Feedback},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {963--973},
  abstract  = {We present an approach to rapidly and easily build natural language interfaces
	to databases for new domains, whose performance improves over time based on
	user feedback, and requires minimal intervention. To achieve this, we adapt
	neural sequence models to map utterances directly to SQL with its full
	expressivity, bypassing any intermediate meaning representations. These models
	are immediately deployed online to solicit feedback from real users to flag
	incorrect queries. Finally, the popularity of SQL facilitates gathering
	annotations for incorrect predictions using the crowd, which is directly used
	to improve our models. This complete feedback loop, without intermediate
	representations or database specific engineering, opens up new ways of building
	high quality semantic parsers. Experiments suggest that this approach can be
	deployed quickly for any new target domain, as we show by learning a semantic
	parser for an online academic database from scratch.},
  url       = {http://aclweb.org/anthology/P17-1089}
}

@InProceedings{qin-wang-kim:2017:Long,
  author    = {Qin, Kechen  and  Wang, Lu  and  Kim, Joseph},
  title     = {Joint Modeling of Content and Discourse Relations in Dialogues},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {974--984},
  abstract  = {We present a joint modeling approach to identify salient discussion points in
	spoken meetings as well as to label the discourse relations between speaker
	turns. A variation of our model is also discussed when discourse relations are
	treated as latent variables. Experimental results on two popular meeting
	corpora show that our joint model can outperform state-of-the-art approaches
	for both phrase-based content selection and discourse relation prediction
	tasks. We also evaluate our model on predicting the consistency among team
	members' understanding of their group decisions. Classifiers trained with
	features constructed from our model achieve significant better predictive
	performance than the state-of-the-art.},
  url       = {http://aclweb.org/anthology/P17-1090}
}

@InProceedings{niculae-park-cardie:2017:Long,
  author    = {Niculae, Vlad  and  Park, Joonsuk  and  Cardie, Claire},
  title     = {Argument Mining with Structured SVMs and RNNs},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {985--995},
  abstract  = {We propose a novel factor graph model for argument mining, designed for
	settings in which the argumentative relations in a document do not necessarily
	form a tree structure. (This is the case in over 20\% of the web comments
	dataset we release.) Our model jointly learns elementary unit type
	classification and argumentative relation prediction. Moreover, our model
	supports SVM and RNN parametrizations, can enforce structure constraints (e.g.,
	transitivity), and can express dependencies between adjacent relations and
	propositions. Our approaches outperform unstructured baselines in both web
	comments and argumentative essay datasets.},
  url       = {http://aclweb.org/anthology/P17-1091}
}

@InProceedings{ji-smith:2017:Long,
  author    = {Ji, Yangfeng  and  Smith, Noah A.},
  title     = {Neural Discourse Structure for Text Categorization},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {996--1005},
  abstract  = {We show that discourse structure, as defined by Rhetorical Structure Theory and
	provided by an existing discourse parser, benefits text categorization.  Our
	approach uses a recursive neural network and a newly proposed attention
	mechanism to compute a representation of the text that focuses on salient
	content, from the perspective of both RST and the task.  Experiments consider
	variants of the approach and illustrate its strengths and weaknesses.},
  url       = {http://aclweb.org/anthology/P17-1092}
}

@InProceedings{qin-EtAl:2017:Long,
  author    = {Qin, Lianhui  and  Zhang, Zhisong  and  Zhao, Hai  and  Hu, Zhiting  and  Xing, Eric},
  title     = {Adversarial Connective-exploiting Networks for Implicit Discourse Relation Classification},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1006--1017},
  abstract  = {Implicit discourse relation classification is of great challenge due to the
	lack of connectives as strong linguistic cues, which motivates the use of
	annotated implicit connectives to improve the recognition. We propose a feature
	imitation framework in which an implicit relation network is driven to learn
	from another neural network with access to connectives, and thus encouraged to
	extract similarly salient features for accurate classification. We develop an
	adversarial model to enable an adaptive imitation scheme through competition
	between the implicit network and a rival feature discriminator. Our method
	effectively transfers discriminability of connectives to the implicit features,
	and achieves state-of-the-art performance on the PDTB benchmark.},
  url       = {http://aclweb.org/anthology/P17-1093}
}

@InProceedings{haponchyk-moschitti:2017:Long,
  author    = {Haponchyk, Iryna  and  Moschitti, Alessandro},
  title     = {Don't understand a measure? Learn it: Structured Prediction for Coreference Resolution optimizing its measures},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1018--1028},
  abstract  = {An interesting aspect of structured prediction is the evaluation of an output
	structure against the gold standard. Especially in the loss-augmented setting,
	the need of finding the max-violating constraint has severely limited the
	expressivity of effective loss functions.
	In this paper, we trade off exact computation for enabling the use and study of
	more complex loss functions for coreference resolution. Most interestingly, we
	show that such functions can be (i) automatically learned also from
	controversial but commonly accepted coreference measures, e.g., MELA, and (ii)
	successfully used in learning algorithms. The accurate model comparison on the
	standard CoNLL-2012 setting shows the benefit of more expressive loss
	functions.},
  url       = {http://aclweb.org/anthology/P17-1094}
}

@InProceedings{andrews-EtAl:2017:Long,
  author    = {Andrews, Nicholas  and  Dredze, Mark  and  Van Durme, Benjamin  and  Eisner, Jason},
  title     = {Bayesian Modeling of Lexical Resources for Low-Resource Settings},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1029--1039},
  abstract  = {Lexical resources such as dictionaries and gazetteers are often used
	as auxiliary data for tasks such as part-of-speech induction and named-entity
	recognition. However, discriminative training with lexical features requires
	annotated data to reliably estimate the lexical feature weights and may result
	in overfitting the lexical features at the expense of features which generalize
	better.
	In this paper, we investigate a more robust approach: we stipulate
	that the lexicon is the result of an assumed generative
	process. Practically, this means that we may treat the lexical
	resources as observations under the proposed generative model.
	The lexical resources provide training data for the generative model
	without requiring separate data to estimate lexical feature
	weights. We evaluate the proposed approach in two settings:
	part-of-speech induction and low-resource named-entity recognition.},
  url       = {http://aclweb.org/anthology/P17-1095}
}

@InProceedings{yang-EtAl:2017:Long,
  author    = {Yang, Zhilin  and  Hu, Junjie  and  Salakhutdinov, Ruslan  and  Cohen, William},
  title     = {Semi-Supervised QA with Generative Domain-Adaptive Nets},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1040--1050},
  abstract  = {We study the problem of semi-supervised question answering---utilizing
	unlabeled text to boost the performance of question answering models. We
	propose a novel training framework, the \textit{Generative Domain-Adaptive
	Nets}. In this framework, we train a generative model to generate questions
	based on the unlabeled text, and combine model-generated questions with
	human-generated questions for training question answering models. We develop
	novel domain adaptation algorithms, based on reinforcement learning, to
	alleviate the discrepancy between the model-generated data distribution and the
	human-generated data distribution. Experiments show that our proposed framework
	obtains substantial improvement from unlabeled text.},
  url       = {http://aclweb.org/anthology/P17-1096}
}

@InProceedings{guu-EtAl:2017:Long,
  author    = {Guu, Kelvin  and  Pasupat, Panupong  and  Liu, Evan  and  Liang, Percy},
  title     = {From Language to Programs: Bridging Reinforcement Learning and Maximum Marginal Likelihood},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1051--1062},
  abstract  = {Our goal is to learn a semantic parser that maps natural language utterances
	into executable programs when only indirect supervision is available: examples
	are labeled with the correct execution result, but not the program itself.
	Consequently, we must search the space of programs for those that output the
	correct result, while not being misled by spurious programs: incorrect
	programs that coincidentally output the correct result. We connect two common
	learning paradigms, reinforcement learning (RL) and maximum marginal likelihood
	(MML), and then present a new learning algorithm that combines the strengths of
	both. The new algorithm guards against spurious programs by combining the
	systematic search traditionally employed in MML with the randomized exploration
	of RL, and by updating parameters such that probability is spread more evenly
	across consistent programs. We apply our learning algorithm to a new neural
	semantic parser and show significant gains over existing state-of-the-art
	results on a recent context-dependent semantic parsing task.},
  url       = {http://aclweb.org/anthology/P17-1097}
}

@InProceedings{nema-EtAl:2017:Long,
  author    = {Nema, Preksha  and  Khapra, Mitesh M.  and  Laha, Anirban  and  Ravindran, Balaraman},
  title     = {Diversity driven attention model for query-based abstractive summarization},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1063--1072},
  abstract  = {Abstractive summarization aims to generate a shorter version of the document
	covering all the salient points in a compact and coherent fashion. On the other
	hand, query-based summarization highlights those points that are relevant in
	the context of a given query. The encode-attend-decode paradigm has achieved
	notable success in machine translation, extractive summarization, dialog
	systems, etc. But it suffers from the drawback of generation of repeated
	phrases. In this work we propose a model for the query-based summarization task
	based on the encode-attend-decode paradigm with two key additions (i) a query
	attention model (in addition to document attention model) which learns to focus
	on different portions of the query at different time steps (instead of using a
	static representation for the query) and (ii) a new diversity based attention
	model which aims to alleviate the problem of repeating phrases in the summary.
	In order to enable the testing of this model we introduce a new query-based
	summarization dataset building on debatepedia. Our experiments show that with
	these two additions the proposed model clearly outperforms vanilla
	encode-attend-decode models with a gain of 28\% (absolute) in ROUGE-L scores.},
  url       = {http://aclweb.org/anthology/P17-1098}
}

@InProceedings{see-liu-manning:2017:Long,
  author    = {See, Abigail  and  Liu, Peter J.  and  Manning, Christopher D.},
  title     = {Get To The Point: Summarization with Pointer-Generator Networks},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1073--1083},
  abstract  = {Neural sequence-to-sequence models have provided a viable new approach for
	abstractive text summarization (meaning they are not restricted to simply
	selecting and rearranging passages from the original text). However, these
	models have two shortcomings: they are liable to reproduce factual details
	inaccurately, and they tend to repeat themselves. In this work we propose a
	novel architecture that augments the standard sequence-to-sequence attentional
	model in two orthogonal ways. First, we use a hybrid pointer-generator network
	that can copy words from the source text via pointing, which aids accurate
	reproduction of information, while retaining the ability to produce novel words
	through the generator. Second, we use coverage to keep track of what has been
	summarized, which discourages repetition. We apply our model to the CNN / Daily
	Mail summarization task, outperforming the current abstractive state-of-the-art
	by at least 2 ROUGE points.},
  url       = {http://aclweb.org/anthology/P17-1099}
}

@InProceedings{peyrard-ecklekohler:2017:Long,
  author    = {Peyrard, Maxime  and  Eckle-Kohler, Judith},
  title     = {Supervised Learning of Automatic Pyramid for Optimization-Based Multi-Document Summarization},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1084--1094},
  abstract  = {We present a  new supervised framework that learns to estimate automatic
	Pyramid scores and uses them for optimization-based extractive multi-document
	summarization. For learning automatic Pyramid scores, we developed a method for
	automatic training data generation which is based on a genetic algorithm using
	automatic Pyramid as the fitness function. Our experimental evaluation shows
	that  our new framework significantly outperforms strong baselines regarding
	automatic Pyramid, and that there is much room for improvement in comparison
	with the upper-bound for automatic Pyramid.},
  url       = {http://aclweb.org/anthology/P17-1100}
}

@InProceedings{zhou-EtAl:2017:Long,
  author    = {Zhou, Qingyu  and  Yang, Nan  and  Wei, Furu  and  Zhou, Ming},
  title     = {Selective Encoding for Abstractive Sentence Summarization},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1095--1104},
  abstract  = {We propose a selective encoding model to extend the sequence-to-sequence
	framework for abstractive sentence summarization. It consists of a sentence
	encoder, a selective gate network, and an attention equipped decoder. The
	sentence encoder and decoder are built with recurrent neural networks. The
	selective gate network constructs a second level sentence representation by
	controlling the information flow from encoder to decoder. The second level
	representation is tailored for sentence summarization task, which leads to
	better performance. We evaluate our model on the English Gigaword, DUC 2004 and
	MSR abstractive sentence summarization datasets. The experimental results show
	that the proposed selective encoding model outperforms the state-of-the-art
	baseline models.},
  url       = {http://aclweb.org/anthology/P17-1101}
}

@InProceedings{florescu-caragea:2017:Long,
  author    = {Florescu, Corina  and  Caragea, Cornelia},
  title     = {PositionRank: An Unsupervised Approach to Keyphrase Extraction from Scholarly Documents},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1105--1115},
  abstract  = {The large and growing amounts of online scholarly data present both challenges
	and opportunities to enhance knowledge discovery. One such challenge is to
	automatically extract a small set of keyphrases from a document that can
	accurately describe the document's content and can facilitate fast information
	processing. In this paper, we propose PositionRank, an unsupervised 
	model for keyphrase extraction from scholarly documents that incorporates
	information from all positions of a word's occurrences into a biased PageRank.
	Our model obtains remarkable improvements in performance over PageRank models
	that do not take into account word positions as well as over strong baselines
	for this task. 
	Specifically, on several datasets of research papers, PositionRank achieves
	improvements as high as $29.09\%$.},
  url       = {http://aclweb.org/anthology/P17-1102}
}

@InProceedings{lowe-EtAl:2017:Long,
  author    = {Lowe, Ryan  and  Noseworthy, Michael  and  Serban, Iulian Vlad  and  Angelard-Gontier, Nicolas  and  Bengio, Yoshua  and  Pineau, Joelle},
  title     = {Towards an Automatic Turing Test: Learning to Evaluate Dialogue Responses},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1116--1126},
  abstract  = {Automatically evaluating the quality of dialogue responses for unstructured
	domains is a challenging problem.  Unfortunately, existing automatic evaluation
	metrics are biased and correlate very poorly with human judgements of response
	quality (Liu et al., 2016). Yet having an accurate automatic evaluation
	procedure is crucial for dialogue research, as it allows rapid prototyping and
	testing of new models with fewer expensive human evaluations. In response to
	this challenge, we formulate automatic dialogue evaluation as a learning
	problem. We present an evaluation model (ADEM) that learns to predict human-like
	scores to input responses, using a new dataset of human response scores. We
	show that the ADEM model's predictions correlate significantly, and at a
	level much higher than word-overlap metrics such as BLEU, with human judgements
	at both the utterance and system-level. We also show that ADEM can generalize
	to evaluating dialogue models unseen during training, an important step for
	automatic dialogue evaluation.},
  url       = {http://aclweb.org/anthology/P17-1103}
}

@InProceedings{hershcovich-abend-rappoport:2017:Long,
  author    = {Hershcovich, Daniel  and  Abend, Omri  and  Rappoport, Ari},
  title     = {A Transition-Based Directed Acyclic Graph Parser for UCCA},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1127--1138},
  abstract  = {We present the first parser for UCCA, a cross-linguistically applicable
	framework for semantic representation, which builds on extensive typological
	work and supports rapid annotation. UCCA poses a challenge for existing parsing
	techniques, as it exhibits reentrancy (resulting in DAG structures),
	discontinuous structures and non-terminal nodes corresponding to complex
	semantic units. To our knowledge, the conjunction of these formal properties is
	not supported by any existing parser. Our transition-based parser, which uses a
	novel transition set and features based on bidirectional LSTMs, has value not
	just for UCCA parsing: its ability to handle more general graph structures can
	inform the development of parsers for other semantic DAG structures, and in
	languages that frequently use discontinuous structures.},
  url       = {http://aclweb.org/anthology/P17-1104}
}

@InProceedings{rabinovich-stern-klein:2017:Long,
  author    = {Rabinovich, Maxim  and  Stern, Mitchell  and  Klein, Dan},
  title     = {Abstract Syntax Networks for Code Generation and Semantic Parsing},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1139--1149},
  abstract  = {Tasks like code generation and semantic parsing require mapping unstructured
	(or partially structured) inputs to well-formed, executable outputs. We
	introduce abstract syntax networks, a modeling framework for these problems.
	The outputs are represented as abstract syntax trees (ASTs) and constructed by
	a decoder with a dynamically-determined modular structure paralleling the
	structure of the output tree. On the benchmark Hearthstone dataset for code
	generation, our model obtains 79.2 BLEU and 22.7\% exact match accuracy,
	compared to previous state-of-the-art values of 67.1 and 6.1\%. Furthermore, we
	perform competitively on the Atis, Jobs, and Geo semantic parsing datasets with
	no task-specific engineering.},
  url       = {http://aclweb.org/anthology/P17-1105}
}

@InProceedings{ding-EtAl:2017:Long,
  author    = {Ding, Yanzhuo  and  Liu, Yang  and  Luan, Huanbo  and  Sun, Maosong},
  title     = {Visualizing and Understanding Neural Machine Translation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1150--1159},
  abstract  = {While neural machine translation (NMT) has made remarkable progress in recent
	years, it is hard to interpret its internal workings due to the continuous
	representations and non-linearity of neural networks. In this work, we propose
	to use layer-wise relevance propagation (LRP) to compute the contribution of
	each contextual word to arbitrary hidden states in the attention-based
	encoder-decoder framework. We show that visualization with LRP helps to
	interpret the internal workings of NMT and analyze translation errors.},
  url       = {http://aclweb.org/anthology/P17-1106}
}

@InProceedings{rehbein-ruppenhofer:2017:Long,
  author    = {Rehbein, Ines  and  Ruppenhofer, Josef},
  title     = {Detecting annotation noise in automatically labelled data},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1160--1170},
  abstract  = {We introduce a method for error detection in automatically annotated text,
	aimed at supporting the creation of high-quality language resources at
	affordable cost. Our method combines an unsupervised generative model with
	human supervision from active learning. We test our approach on in-domain and
	out-of-domain data in two languages, in AL simulations and in a real world
	setting. For all settings, the results show that our method is able to detect
	annotation errors with high precision and high recall.},
  url       = {http://aclweb.org/anthology/P17-1107}
}

@InProceedings{tan-wan-xiao:2017:Long,
  author    = {Tan, Jiwei  and  Wan, Xiaojun  and  Xiao, Jianguo},
  title     = {Abstractive Document Summarization with a Graph-Based Attentional Neural Model},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1171--1181},
  abstract  = {Abstractive summarization is the ultimate goal of document summarization
	research, but previously it is less investigated due to the immaturity of text
	generation techniques. Recently impressive progress has been made to
	abstractive sentence summarization using neural models. Unfortunately, attempts
	on abstractive document summarization are still in a primitive stage, and the
	evaluation results are worse than extractive methods on benchmark datasets. In
	this paper, we review the difficulties of neural abstractive document
	summarization, and propose a novel graph-based attention mechanism in the
	sequence-to-sequence framework. The intuition is to address the saliency factor
	of summarization, which has been overlooked by prior works. Experimental
	results demonstrate our model is able to achieve considerable improvement over
	previous neural abstractive models. The data-driven neural abstractive method
	is also competitive with state-of-the-art extractive methods.},
  url       = {http://aclweb.org/anthology/P17-1108}
}

@InProceedings{cotterell-eisner:2017:Long,
  author    = {Cotterell, Ryan  and  Eisner, Jason},
  title     = {Probabilistic Typology: Deep Generative Models of Vowel Inventories},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1182--1192},
  abstract  = {Linguistic typology studies the range of structures present in human language.
	The main goal of the field is to discover which sets of possible phenomena are
	universal, and which are merely frequent. For example, all languages have
	vowels, while most---but not all---languages have an /u/ sound. In this paper
	we present the first probabilistic treatment of a basic question in
	phonological typology: What makes a natural vowel inventory?  We introduce a
	series of deep stochastic point processes, and contrast them with previous
	computational, simulation-based approaches.  We provide a comprehensive suite
	of experiments on over 200 distinct languages.},
  url       = {http://aclweb.org/anthology/P17-1109}
}

@InProceedings{chen-EtAl:2017:Long2,
  author    = {Chen, Xinchi  and  Shi, Zhan  and  Qiu, Xipeng  and  Huang, Xuanjing},
  title     = {Adversarial Multi-Criteria Learning for Chinese Word Segmentation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1193--1203},
  abstract  = {Different linguistic perspectives causes many diverse segmentation criteria for
	Chinese word segmentation (CWS). Most existing methods focus on improve the
	performance for each single criterion. However, it is interesting to exploit
	these different criteria and mining their common underlying knowledge. In this
	paper, we propose adversarial multi-criteria learning for CWS by integrating
	shared knowledge from multiple heterogeneous segmentation criteria. 
	Experiments on eight corpora with heterogeneous segmentation criteria show that
	the performance of each corpus obtains a significant improvement, compared to
	single-criterion learning. Source codes of this paper are available on Github.},
  url       = {http://aclweb.org/anthology/P17-1110}
}

@InProceedings{kurita-kawahara-kurohashi:2017:Long,
  author    = {Kurita, Shuhei  and  Kawahara, Daisuke  and  Kurohashi, Sadao},
  title     = {Neural Joint Model for Transition-based Chinese Syntactic Analysis},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1204--1214},
  abstract  = {We present neural network-based joint models for Chinese word segmentation, POS
	tagging and dependency parsing. Our models are the first neural approaches for
	fully joint Chinese analysis that is known to prevent the error propagation
	problem of pipeline models. Although word embeddings play a key role in
	dependency parsing, they cannot be applied directly to the joint task in the
	previous work. To address this problem, we propose embeddings of character
	strings, in addition to words. Experiments show that our models outperform
	existing systems in Chinese word segmentation and POS tagging, and perform
	preferable accuracies in dependency parsing. We also explore bi-LSTM models
	with fewer features.},
  url       = {http://aclweb.org/anthology/P17-1111}
}

@InProceedings{buys-blunsom:2017:Long,
  author    = {Buys, Jan  and  Blunsom, Phil},
  title     = {Robust Incremental Neural Semantic Graph Parsing},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1215--1226},
  abstract  = {Parsing sentences to linguistically-expressive semantic representations is a
	key goal of Natural Language Processing. Yet statistical parsing has focussed
	almost exclusively on bilexical dependencies or domain-specific logical forms.
	We propose a neural encoder-decoder transition-based parser which is the first
	full-coverage semantic graph parser for Minimal Recursion Semantics (MRS).
	The model architecture uses stack-based embedding features, predicting graphs
	jointly with unlexicalized predicates and their token alignments. Our parser
	is more accurate than attention-based baselines on MRS, and on an additional
	Abstract Meaning Representation (AMR) benchmark, and GPU batch processing
	makes it an order of magnitude faster than a high-precision grammar-based
	parser. Further, the 86.69% Smatch score of our MRS parser is higher than the
	upper-bound on AMR parsing, making MRS an attractive choice as a semantic
	representation.},
  url       = {http://aclweb.org/anthology/P17-1112}
}

@InProceedings{zheng-EtAl:2017:Long,
  author    = {Zheng, Suncong  and  Wang, Feng  and  Bao, Hongyun  and  Hao, Yuexing  and  Zhou, Peng  and  Xu, Bo},
  title     = {Joint Extraction of Entities and Relations Based on a Novel Tagging Scheme},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1227--1236},
  abstract  = {Joint extraction of entities and relations is an important task in information
	extraction. To tackle this problem, we firstly propose a novel tagging scheme
	that can convert the joint extraction task to a tagging problem. Then, based
	on our tagging scheme, we study different end-to-end models to extract entities
	and their relations directly, without identifying entities and relations
	separately. We conduct experiments on a public dataset produced by distant
	supervision method and the experimental results show that the tagging based
	methods are better than most of the existing pipelined and joint learning
	methods. What's more, the end-to-end model proposed in this paper, achieves
	the best results on the public dataset.},
  url       = {http://aclweb.org/anthology/P17-1113}
}

@InProceedings{xu-jiang-watcharawittayakul:2017:Long,
  author    = {Xu, Mingbin  and  Jiang, Hui  and  Watcharawittayakul, Sedtawut},
  title     = {A Local Detection Approach for Named Entity Recognition and Mention Detection},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1237--1247},
  abstract  = {In this paper, we study a novel approach for named entity recognition (NER) and
	mention detection (MD) in natural language processing. Instead of treating NER
	as a sequence labeling problem, we propose a new local detection approach,
	which relies on the recent fixed-size ordinally forgetting encoding (FOFE)
	method to fully encode each sentence fragment and its left/right contexts into
	a fixed-size representation. Subsequently, a simple feedforward neural network
	(FFNN) is learned to either reject or predict entity label for each individual
	text fragment. The proposed method has been evaluated in several popular NER
	and MD tasks, including CoNLL 2003 NER task and  TAC-KBP2015 and TAC-KBP2016
	Tri-lingual Entity Discovery and Linking (EDL) tasks. Our method has yielded
	pretty strong performance in all of these examined tasks. This local detection
	approach has shown many advantages over the traditional sequence labeling
	methods.},
  url       = {http://aclweb.org/anthology/P17-1114}
}

@InProceedings{gritta-EtAl:2017:Long,
  author    = {Gritta, Milan  and  Pilehvar, Mohammad Taher  and  Limsopatham, Nut  and  Collier, Nigel},
  title     = {Vancouver Welcomes You! Minimalist Location Metonymy Resolution},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1248--1259},
  abstract  = {Named entities are frequently used in a metonymic manner. They serve as
	references to related entities such as people and organisations. Accurate
	identification and interpretation of metonymy can be directly beneficial to
	various NLP applications, such as Named Entity Recognition and Geographical
	Parsing. Until now, metonymy resolution (MR) methods mainly relied on parsers,
	taggers, dictionaries, external word lists and other handcrafted lexical
	resources. We show how a minimalist neural approach combined with a novel
	predicate window method can achieve competitive results on the SemEval 2007
	task on Metonymy Resolution. Additionally, we contribute with a new
	Wikipedia-based MR dataset called RelocaR, which is tailored towards locations
	as well as improving previous deficiencies in annotation guidelines.},
  url       = {http://aclweb.org/anthology/P17-1115}
}

@InProceedings{miura-EtAl:2017:Long,
  author    = {Miura, Yasuhide  and  Taniguchi, Motoki  and  Taniguchi, Tomoki  and  Ohkuma, Tomoko},
  title     = {Unifying Text, Metadata, and User Network Representations with a Neural Network for Geolocation Prediction},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1260--1272},
  abstract  = {We propose a novel geolocation prediction model using a complex neural network.
	Geolocation prediction in social media has attracted many researchers to use
	information of various types. Our model unifies text, metadata, and user
	network representations with an attention mechanism to overcome previous
	ensemble approaches. In an evaluation using two open datasets, the proposed
	model exhibited a maximum 3.8% increase in accuracy and a maximum of 6.6%
	increase in accuracy@161 against previous models. We further analyzed several
	intermediate layers of our model, which revealed that their states capture some
	statistical characteristics of the datasets.},
  url       = {http://aclweb.org/anthology/P17-1116}
}

@InProceedings{pasunuru-bansal:2017:Long,
  author    = {Pasunuru, Ramakanth  and  Bansal, Mohit},
  title     = {Multi-Task Video Captioning with Video and Entailment Generation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1273--1283},
  abstract  = {Video captioning, the task of describing the content of a video, has seen some
	promising improvements in recent years with sequence-to-sequence models, but
	accurately learning the temporal and logical dynamics involved in the task
	still remains a challenge, especially given the lack of sufficient annotated
	data. We improve video captioning by sharing knowledge with two related
	directed-generation tasks: a temporally-directed unsupervised video prediction
	task to learn richer context-aware video encoder representations, and a
	logically-directed language entailment generation task to learn better
	video-entailing caption decoder representations. For this, we present a
	many-to-many multi-task learning model that shares parameters across the
	encoders and decoders of the three tasks. We achieve significant improvements
	and the new state-of-the-art on several standard video captioning datasets
	using diverse automatic and human evaluations. We also show mutual multi-task
	improvements on the entailment generation task.},
  url       = {http://aclweb.org/anthology/P17-1117}
}

@InProceedings{santos-EtAl:2017:Long,
  author    = {Santos, Leandro  and  Corr\^{e}a J\'{u}nior, Edilson Anselmo  and  Oliveira Jr, Osvaldo  and  Amancio, Diego  and  Mansur, Let\'{i}cia  and  Alu\'{i}sio, Sandra},
  title     = {Enriching Complex Networks with Word Embeddings for Detecting Mild Cognitive Impairment from Speech Transcripts},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1284--1296},
  abstract  = {Mild Cognitive Impairment (MCI) is a mental disorder difficult to diagnose.
	Linguistic features, mainly from parsers, have been used to detect MCI, but
	this is not suitable for large-scale assessments. MCI disfluencies produce
	non-grammatical speech that requires manual or high precision automatic
	correction of transcripts.  In this paper, we modeled transcripts into complex
	networks and enriched them with word embedding (CNE) to better represent short
	texts produced in neuropsychological assessments. The network measurements were
	applied with well-known classifiers to automatically identify MCI in
	transcripts, in a binary classification task. A comparison was made with the
	performance of traditional approaches using Bag of Words (BoW) and linguistic
	features for three datasets: DementiaBank in English, and Cinderella and
	Arizona-Battery in Portuguese. Overall, CNE provided higher accuracy than using
	only complex networks, while Support Vector Machine was superior to other
	classifiers. CNE provided the highest accuracies for DementiaBank and
	Cinderella, but BoW was more efficient for the Arizona-Battery dataset probably
	owing to its short narratives. The approach using linguistic features yielded
	higher accuracy if the transcriptions of the Cinderella dataset were manually
	revised. Taken together, the results indicate that complex networks enriched
	with embedding is promising for detecting MCI in large-scale assessments.},
  url       = {http://aclweb.org/anthology/P17-1118}
}

@InProceedings{kim-stratos-kim:2017:Long2,
  author    = {Kim, Young-Bum  and  Stratos, Karl  and  Kim, Dongchan},
  title     = {Adversarial Adaptation of Synthetic or Stale Data},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1297--1307},
  abstract  = {Two types of data shift common in practice are 1. transferring from synthetic
	data to live user data (a deployment shift), and
	2. transferring from stale data to current data (a temporal shift). Both cause
	a distribution mismatch between training and evaluation, leading to a model
	that overfits the flawed training data and performs poorly on the test data. We
	propose a solution to this mismatch problem by framing it as domain adaptation,
	treating the flawed training dataset as a source domain and
	the evaluation dataset as a target domain. To this end, we use and build on
	several recent advances in neural domain adaptation such as adversarial
	training (Ganin et al., 2016) and domain separation network (Bousmalis et al.,
	2016), proposing a new effective adversarial training scheme. In both
	supervised and unsupervised adaptation scenarios, our approach yields clear
	improvement over strong baselines.},
  url       = {http://aclweb.org/anthology/P17-1119}
}

@InProceedings{akasaki-kaji:2017:Long,
  author    = {Akasaki, Satoshi  and  Kaji, Nobuhiro},
  title     = {Chat Detection in an Intelligent Assistant: Combining Task-oriented and Non-task-oriented Spoken Dialogue Systems},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1308--1319},
  abstract  = {Recently emerged intelligent assistants on smartphones and home electronics
	(e.g., Siri and Alexa) can be seen as novel hybrids of domain-specific
	task-oriented spoken dialogue systems and open-domain non-task-oriented ones.
	To realize such hybrid dialogue systems, this paper investigates determining
	whether or not a user is going to have a chat with the system. To address the
	lack of benchmark datasets for this task, we construct a new dataset consisting
	of 15,160 utterances collected from the real log data of a commercial
	intelligent assistant (and will release the dataset to facilitate future
	research activity). In addition, we investigate using tweets and Web search
	queries for handling open-domain user utterances, which characterize the task
	of chat detection. Experimental experiments demonstrated that, while simple
	supervised methods are effective, the use of the tweets and search queries
	further improves the F$_1$-score from 86.21 to 87.53.},
  url       = {http://aclweb.org/anthology/P17-1120}
}

@InProceedings{tiennguyen-joty:2017:Long,
  author    = {Tien Nguyen, Dat  and  Joty, Shafiq},
  title     = {A Neural Local Coherence Model},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1320--1330},
  abstract  = {We propose a local coherence model based on a convolutional neural network that
	operates over the entity grid representation of a text. The model captures long
	range entity transitions along with entity-specific features without losing
	generalization, thanks to the power of distributed representation. We present a
	pairwise ranking method to train the model in an end-to-end fashion on a task
	and learn task-specific high level features. Our evaluation on three different
	coherence assessment tasks demonstrates that our model achieves state of the
	art results outperforming existing models by a good margin.},
  url       = {http://aclweb.org/anthology/P17-1121}
}

@InProceedings{cagan-frank-tsarfaty:2017:Long,
  author    = {Cagan, Tomer  and  Frank, Stefan L.  and  Tsarfaty, Reut},
  title     = {Data-Driven Broad-Coverage Grammars for Opinionated Natural Language Generation (ONLG)},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1331--1341},
  abstract  = {Opinionated Natural Language Generation (ONLG) is a new, challenging, task that
	aims to automatically generate human-like, subjective, responses to opinionated
	articles online. 
	We present a data-driven architecture for ONLG that generates subjective
	responses triggered by users' agendas, consisting of topics and sentiments,
	and based on wide-coverage automatically-acquired generative grammars.
	We compare three types of grammatical representations that we design for ONLG,
	which interleave different layers of linguistic information and are induced
	from a new, enriched dataset we developed.
	Our evaluation shows that generation with Relational-Realizational (Tsarfaty
	and Sima'an, 2008) inspired grammar gets better language model scores than
	lexicalized grammars {\`a} la Collins (2003), and that the latter gets better
	human-evaluation scores. 
	We also show that conditioning the generation on topic models makes generated
	responses more relevant to the document content.},
  url       = {http://aclweb.org/anthology/P17-1122}
}

@InProceedings{du-shao-cardie:2017:Long,
  author    = {Du, Xinya  and  Shao, Junru  and  Cardie, Claire},
  title     = {Learning to Ask: Neural Question Generation for Reading Comprehension},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1342--1352},
  abstract  = {We study automatic question generation for sentences from text passages in
	reading comprehension. We introduce an attention-based sequence learning model
	for the task and investigate the effect of encoding sentence- vs.
	paragraph-level information. In contrast to all previous work, our model does
	not rely on hand-crafted rules or a sophisticated NLP pipeline;  it is instead
	trainable end-to-end via sequence-to-sequence learning. Automatic evaluation
	results show that our system significantly outperforms the state-of-the-art
	rule-based system. In human evaluations, questions generated by our system are
	also rated as being more natural (i.e., grammaticality, fluency) and as more
	difficult to answer (in terms of syntactic and lexical divergence from the
	original text and reasoning needed to answer).},
  url       = {http://aclweb.org/anthology/P17-1123}
}

@InProceedings{pvs-meyer:2017:Long,
  author    = {PVS, Avinesh  and  Meyer, Christian M.},
  title     = {Joint Optimization of User-desired Content in Multi-document Summaries by Learning from User Feedback},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1353--1363},
  abstract  = {In this paper, we propose an extractive multi-document summarization (MDS)
	system using joint optimization and active learning for content selection
	grounded in user feedback. Our method interactively obtains user feedback to
	gradually improve the results of a state-of-the-art integer linear programming
	(ILP) framework for MDS. Our methods complement fully automatic methods in
	producing high-quality summaries with a minimum number of iterations and
	feedbacks.
	We conduct multiple simulation-based experiments and analyze the effect of
	feedback-based concept selection in the ILP setup in order to maximize the
	user-desired content in the summary.},
  url       = {http://aclweb.org/anthology/P17-1124}
}

@InProceedings{zhang-EtAl:2017:Long1,
  author    = {Zhang, Jiyuan  and  Feng, Yang  and  Wang, Dong  and  Wang, Yang  and  Abel, Andrew  and  Zhang, Shiyue  and  Zhang, Andi},
  title     = {Flexible and Creative Chinese Poetry Generation Using Neural Memory},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1364--1373},
  abstract  = {It has been shown that Chinese poems can be successfully generated by
	sequence-to-sequence neural models, particularly with the attention mechanism.
	A potential problem of this approach, however, is that neural models can only
	learn abstract rules, while poem generation is a highly creative process that
	involves not only rules but also innovations for which pure statistical models
	are not appropriate in principle. This work proposes a memory augmented neural
	model for Chinese poem generation, where the neural model and the augmented
	memory work together to balance the requirements of linguistic accordance and
	aesthetic innovation, leading to innovative generations that are still
	rule-compliant. In addition, it is found that the memory mechanism provides
	interesting flexibility that can be used to generate poems with different
	styles.},
  url       = {http://aclweb.org/anthology/P17-1125}
}

@InProceedings{murakami-EtAl:2017:Long,
  author    = {Murakami, Soichiro  and  Watanabe, Akihiko  and  Miyazawa, Akira  and  Goshima, Keiichi  and  Yanase, Toshihiko  and  Takamura, Hiroya  and  Miyao, Yusuke},
  title     = {Learning to Generate Market Comments from Stock Prices},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1374--1384},
  abstract  = {This paper presents a novel encoder-decoder model for automatically generating
	market comments from stock prices. The model first encodes both short- and
	long-term series of stock prices so that it can mention short- and long-term
	changes in stock prices. In the decoding phase, our model can also generate a
	numerical value by selecting an appropriate arithmetic operation such as
	subtraction or rounding, and applying it to the input stock prices. Empirical
	experiments show that our best model generates market comments at the fluency
	and the informativeness approaching human-generated reference texts.},
  url       = {http://aclweb.org/anthology/P17-1126}
}

@InProceedings{wang-EtAl:2017:Long4,
  author    = {Wang, Liangguo  and  Jiang, Jing  and  Chieu, Hai Leong  and  Ong, Chen Hui  and  Song, Dandan  and  Liao, Lejian},
  title     = {Can Syntax Help? Improving an LSTM-based Sentence Compression Model for New Domains},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1385--1393},
  abstract  = {In this paper, we study how to improve the
	domain adaptability of a deletion-based
	Long Short-Term Memory (LSTM) neural network model for sentence compression. We
	hypothesize that syntactic information helps in making such models
	more robust across domains. We propose two major changes to the model: using
	explicit syntactic features and introducing syntactic constraints through
	Integer Linear Programming (ILP). Our evaluation
	shows that the proposed model works better than the original model as well as a
	traditional non-neural-network-based model
	in a cross-domain setting.},
  url       = {http://aclweb.org/anthology/P17-1127}
}

@InProceedings{wang-EtAl:2017:Long5,
  author    = {Wang, Chengyu  and  Yan, Junchi  and  Zhou, Aoying  and  He, Xiaofeng},
  title     = {Transductive Non-linear Learning for Chinese Hypernym Prediction},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1394--1404},
  abstract  = {Finding the correct hypernyms for entities is essential for taxonomy learning,
	fine-grained entity categorization, query understanding, etc. Due to the
	flexibility of the Chinese language, it is challenging to identify hypernyms
	in Chinese accurately. Rather than extracting hypernyms from texts, in this
	paper, we present a transductive learning approach to establish mappings from
	entities to hypernyms in the embedding space directly. It combines linear and
	non-linear embedding projection models, with the capacity of
	encoding arbitrary language-specific rules. Experiments on real-world datasets
	illustrate that our approach outperforms previous methods for Chinese hypernym
	prediction.},
  url       = {http://aclweb.org/anthology/P17-1128}
}

@InProceedings{xie-xing:2017:Long,
  author    = {Xie, Pengtao  and  Xing, Eric},
  title     = {A Constituent-Centric Neural Architecture for Reading Comprehension},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1405--1414},
  abstract  = {Reading comprehension (RC), aiming to understand natural texts and answer
	questions therein, is a challenging task. In this paper, we study the RC
	problem on the Stanford Question Answering Dataset (SQuAD). Observing from the
	training set that most correct answers are centered around constituents in the
	parse tree, we design a constituent-centric neural architecture where the
	generation of candidate answers and their representation learning are both
	based on constituents and guided by the parse tree. Under this architecture,
	the search space of candidate answers can be greatly reduced without
	sacrificing the coverage of correct answers and the syntactic, hierarchical and
	compositional structure among constituents can be well captured, which
	contributes to better representation learning of the candidate answers. On
	SQuAD, our method achieves the state of the art performance and the ablation
	study corroborates the effectiveness of individual modules.},
  url       = {http://aclweb.org/anthology/P17-1129}
}

@InProceedings{xu-yang:2017:Long,
  author    = {Xu, Ruochen  and  Yang, Yiming},
  title     = {Cross-lingual Distillation for Text Classification},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1415--1425},
  abstract  = {Cross-lingual text classification(CLTC) is the task of classifying documents
	written in different languages into the same taxonomy of categories. 
	This paper presents a novel approach to CLTC that builds on model distillation,
	which adapts and extends a framework originally proposed for model compression.
	Using soft probabilistic predictions for the documents in a label-rich language
	as the (induced) supervisory labels in a parallel corpus of documents, we train
	classifiers successfully for new languages in which labeled training data are
	not available. An adversarial feature adaptation technique is also applied
	during the model training to reduce distribution mismatch. We conducted
	experiments on two benchmark CLTC datasets, treating English as the source
	language and German, French, Japan and Chinese as the unlabeled target
	languages. The proposed approach had the advantageous or comparable performance
	of the other state-of-art methods.},
  url       = {http://aclweb.org/anthology/P17-1130}
}

@InProceedings{perezrosas-EtAl:2017:Long,
  author    = {P\'{e}rez-Rosas, Ver\'{o}nica  and  Mihalcea, Rada  and  Resnicow, Kenneth  and  Singh, Satinder  and  An, Lawrence},
  title     = {Understanding and Predicting Empathic Behavior in Counseling Therapy},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1426--1435},
  abstract  = {Counselor empathy is associated with better outcomes in psychology and
	behavioral counseling. In this paper, we explore several aspects pertaining to
	counseling interaction dynamics and their relation to counselor empathy during
	motivational interviewing encounters. Particularly, we analyze aspects such as
	participants' engagement, participants' verbal and nonverbal accommodation, as
	well as topics being discussed during the conversation, with the final goal of
	identifying linguistic and acoustic markers of counselor empathy. We also show
	how we can use these findings alongside other raw linguistic and acoustic
	features to  build accurate counselor empathy classifiers with accuracies of up
	to 80%.},
  url       = {http://aclweb.org/anthology/P17-1131}
}

@InProceedings{yang-mitchell:2017:Long,
  author    = {Yang, Bishan  and  Mitchell, Tom},
  title     = {Leveraging Knowledge Bases in LSTMs for Improving Machine Reading},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1436--1446},
  abstract  = {This paper focuses on how to take advantage of external knowledge bases (KBs)
	to improve recurrent neural networks for machine reading. Traditional methods
	that exploit knowledge from KBs encode knowledge as discrete indicator
	features. Not only do these features generalize poorly, but they require
	task-specific feature engineering to achieve good performance. We propose
	KBLSTM, a novel neural model that leverages continuous representations of KBs
	to enhance the learning of recurrent neural networks for machine reading. To
	effectively integrate background knowledge with information from the currently
	processed text, our model employs an attention mechanism with a sentinel to
	adaptively decide whether to attend to background knowledge and which
	information from KBs is useful. Experimental results show that our model
	achieves accuracies that surpass the previous state-of-the-art results for both
	entity extraction and event extraction on the widely used ACE2005 dataset.},
  url       = {http://aclweb.org/anthology/P17-1132}
}

@InProceedings{pan-EtAl:2017:Long1,
  author    = {Pan, Liangming  and  Li, Chengjiang  and  Li, Juanzi  and  Tang, Jie},
  title     = {Prerequisite Relation Learning for Concepts in MOOCs},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1447--1456},
  abstract  = {What prerequisite knowledge should students achieve a level of mastery before
	moving forward to learn subsequent coursewares? We study the extent to which
	the prerequisite relation between knowledge concepts in Massive Open Online
	Courses (MOOCs) can be inferred automatically. In particular, what kinds of
	information can be leverage to uncover the potential prerequisite relation
	between knowledge concepts. We first propose a representation learning-based
	method for learning latent representations of course concepts, and then
	investigate how different features capture the prerequisite relations between
	concepts. Our experiments on three datasets form Coursera show that the
	proposed method achieves significant improvements (+5.9-48.0% by F1-score)
	comparing with existing methods.},
  url       = {http://aclweb.org/anthology/P17-1133}
}

@InProceedings{malmasi-EtAl:2017:Long,
  author    = {Malmasi, Shervin  and  Dras, Mark  and  Johnson, Mark  and  Du, Lan  and  Wolska, Magdalena},
  title     = {Unsupervised Text Segmentation Based on Native Language Characteristics},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1457--1469},
  abstract  = {Most work on segmenting text does so on the basis of topic changes,
	but it can be of interest to segment by other, stylistically expressed
	characteristics such as change of authorship or native language.  We
	propose a Bayesian unsupervised text segmentation approach to the
	latter.  While baseline models achieve essentially random segmentation
	on our task, indicating its difficulty, a Bayesian model that
	incorporates appropriately compact language models and alternating
	asymmetric priors can achieve scores on the standard metrics around
	halfway to perfect segmentation.},
  url       = {http://aclweb.org/anthology/P17-1134}
}

@InProceedings{ni-dinu-florian:2017:Long,
  author    = {Ni, Jian  and  Dinu, Georgiana  and  Florian, Radu},
  title     = {Weakly Supervised Cross-Lingual Named Entity Recognition via Effective Annotation and Representation Projection},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1470--1480},
  abstract  = {The state-of-the-art named entity recognition (NER) systems are supervised
	machine learning models that require large amounts of manually annotated data
	to achieve high accuracy. However, annotating NER data by human is expensive
	and time-consuming, and can be quite difficult for a new language. In this
	paper, we present two weakly supervised approaches for cross-lingual NER with
	no human annotation in a target language. The first approach is to create
	automatically labeled NER data for a target language via annotation projection
	on comparable corpora, where we develop a heuristic scheme that effectively
	selects good-quality projection-labeled data from noisy data. The second
	approach is to project distributed representations of words (word embeddings)
	from a target language to a source language, so that the source-language NER
	system can be applied to the target language without re-training. We also
	design two co-decoding schemes that effectively combine the outputs of the two
	projection-based approaches. We evaluate the performance of the proposed
	approaches on both in-house and open NER data for several target languages. The
	results show that the combined systems outperform three other weakly supervised
	approaches on the CoNLL data.},
  url       = {http://aclweb.org/anthology/P17-1135}
}

@InProceedings{chakrabarty-pandit-garain:2017:Long,
  author    = {Chakrabarty, Abhisek  and  Pandit, Onkar Arun  and  Garain, Utpal},
  title     = {Context Sensitive Lemmatization Using Two Successive Bidirectional Gated Recurrent Networks},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1481--1491},
  abstract  = {We introduce a composite deep neural network architecture for supervised and
	language independent context sensitive lemmatization. The proposed method
	considers the task as to identify the correct edit tree representing the
	transformation between a word-lemma pair. To find the lemma of a surface word,
	we exploit two successive bidirectional gated recurrent structures - the first
	one is used to extract the character level dependencies and the next one
	captures the contextual information of the given word. The key advantages of
	our model compared to the state-of-the-art lemmatizers such as Lemming and
	Morfette are - (i) it is independent of human decided features (ii) except the
	gold lemma, no other expensive morphological attribute is required for joint
	learning. We evaluate the lemmatizer on nine languages - Bengali, Catalan,
	Dutch, Hindi, Hungarian, Italian, Latin, Romanian and Spanish. It is found that
	except Bengali, the proposed method outperforms Lemming and Morfette on the
	other languages. To train the model on Bengali, we develop a gold lemma
	annotated dataset (having 1,702 sentences with a total of 20,257 word tokens),
	which is an additional contribution of this work.},
  url       = {http://aclweb.org/anthology/P17-1136}
}

@InProceedings{kawakami-dyer-blunsom:2017:Long,
  author    = {Kawakami, Kazuya  and  Dyer, Chris  and  Blunsom, Phil},
  title     = {Learning to Create and Reuse Words in Open-Vocabulary Neural Language Modeling},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1492--1502},
  abstract  = {Fixed-vocabulary language models fail to account for one of the most
	characteristic statistical facts of natural language: the frequent creation and
	reuse of new word types. Although character-level language models offer a
	partial solution in that they can create word types not attested in the
	training corpus, they do not capture the ``bursty'' distribution of such words.
	In this paper, we augment a hierarchical LSTM language model that generates
	sequences of word tokens character by character with a caching mechanism that
	learns to reuse previously generated words. 
	To validate our model we construct a new open-vocabulary language modeling
	corpus (the Multilingual Wikipedia Corpus; MWC) from comparable Wikipedia
	articles in 7 typologically diverse languages and demonstrate the effectiveness
	of our model across this range of languages.},
  url       = {http://aclweb.org/anthology/P17-1137}
}

@InProceedings{kreutzer-sokolov-riezler:2017:Long,
  author    = {Kreutzer, Julia  and  Sokolov, Artem  and  Riezler, Stefan},
  title     = {Bandit Structured Prediction for Neural Sequence-to-Sequence Learning},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1503--1513},
  abstract  = {Bandit structured prediction describes a stochastic optimization framework
	where learning is performed from partial feedback. This feedback is received in
	the form of a task loss evaluation to a predicted output structure, without
	having access to gold standard structures. We advance this framework by lifting
	linear bandit learning to neural sequence-to-sequence learning problems using
	attention-based recurrent neural networks. Furthermore, we show how to
	incorporate control variates into our learning algorithms for variance
	reduction and improved generalization. We present an evaluation on a neural
	machine translation task that shows improvements of up to 5.89 BLEU points for
	domain adaptation from simulated bandit feedback.},
  url       = {http://aclweb.org/anthology/P17-1138}
}

@InProceedings{zhang-EtAl:2017:Long2,
  author    = {Zhang, Jiacheng  and  Liu, Yang  and  Luan, Huanbo  and  Xu, Jingfang  and  Sun, Maosong},
  title     = {Prior Knowledge Integration for Neural Machine Translation using Posterior Regularization},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1514--1523},
  abstract  = {Although neural machine translation has made significant progress recently, how
	to integrate multiple overlapping, arbitrary prior knowledge sources remains a
	challenge. In this work, we propose to use posterior regularization to provide
	a general framework for integrating prior knowledge into neural machine
	translation. We represent prior knowledge sources as features in a log-linear
	model, which guides the learning processing of the neural translation model.
	Experiments on Chinese-English dataset show that our approach leads to
	significant improvements.},
  url       = {http://aclweb.org/anthology/P17-1139}
}

@InProceedings{zhang-EtAl:2017:Long3,
  author    = {Zhang, Jinchao  and  Wang, Mingxuan  and  Liu, Qun  and  Zhou, Jie},
  title     = {Incorporating Word Reordering Knowledge into Attention-based Neural Machine Translation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1524--1534},
  abstract  = {This paper proposes three distortion models to explicitly incorporate the word
	reordering knowledge into attention-based Neural Machine Translation (NMT) for
	further improving translation performance. Our proposed models enable attention
	mechanism to attend to source words regarding both the semantic requirement and
	the word reordering penalty. Experiments on Chinese-English translation show
	that the approaches can improve word alignment quality and achieve significant
	translation improvements over a basic attention-based NMT by large margins.
	Compared with previous works on identical corpora, our system achieves the
	state-of-the-art performance on translation quality.},
  url       = {http://aclweb.org/anthology/P17-1140}
}

@InProceedings{hokamp-liu:2017:Long,
  author    = {Hokamp, Chris  and  Liu, Qun},
  title     = {Lexically Constrained Decoding for Sequence Generation Using Grid Beam Search},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1535--1546},
  abstract  = {We present Grid Beam Search (GBS), an algorithm which extends beam search
	to allow the inclusion of pre-specified lexical constraints. The algorithm can
	be used with any model which generates sequences token by token. Lexical
	constraints take the form of phrases or words that must be present in the
	output sequence. This is a very general way to incorporate auxiliary knowledge
	into a model's output without requiring any modification of the parameters or
	training data. We demonstrate the feasibility and flexibility of
	Lexically Constrained Decoding by conducting experiments on Neural
	Interactive-Predictive Translation, as well as Domain Adaptation for Neural
	Machine Translation. Experiments show that GBS can provide large improvements
	in translation quality in interactive scenarios, and that, even without any
	user input, GBS can be used to achieve significant gains in performance in
	domain adaptation scenarios.},
  url       = {http://aclweb.org/anthology/P17-1141}
}

@InProceedings{tong-EtAl:2017:Long,
  author    = {Tong, Edmund  and  Zadeh, Amir  and  Jones, Cara  and  Morency, Louis-Philippe},
  title     = {Combating Human Trafficking with Multimodal Deep Models},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1547--1556},
  abstract  = {Human trafficking is a global epidemic affecting millions of people across the
	planet. Sex trafficking, the dominant form of human trafficking, has seen a
	significant rise mostly due to the abundance of escort websites, where human
	traffickers can openly advertise among at-will escort advertisements. In this
	paper, we take a major step in the automatic detection of advertisements
	suspected to pertain to human trafficking. We present a novel dataset called
	Trafficking-10k, with more than 10,000~advertisements annotated for this task.
	The dataset contains two sources of information per advertisement: text and
	images. For the accurate detection of trafficking advertisements, we designed
	and trained a deep multimodal model called the Human Trafficking Deep Network
	(HTDN).},
  url       = {http://aclweb.org/anthology/P17-1142}
}

@InProceedings{lim-EtAl:2017:Long,
  author    = {Lim, Swee Kiat  and  Muis, Aldrian Obaja  and  Lu, Wei  and  Ong, Chen Hui},
  title     = {MalwareTextDB: A Database for Annotated Malware Articles},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1557--1567},
  abstract  = {Cybersecurity risks and malware threats are becoming increasingly dangerous and
	common. Despite the severity of the problem, there has been few NLP efforts
	focused on tackling cybersecurity.
	In this paper, we discuss the construction of a new database for annotated
	malware texts. An annotation framework is introduced based on the MAEC
	vocabulary for defining malware characteristics, along with a database
	consisting of 39 annotated APT reports with a total of 6,819 sentences. We also
	use the database to construct models that can potentially help cybersecurity
	researchers in their data collection and analytics efforts.},
  url       = {http://aclweb.org/anthology/P17-1143}
}

@InProceedings{zhang-EtAl:2017:Long4,
  author    = {Zhang, Fan  and  Hashemi, Homa B.  and  Hwa, Rebecca  and  Litman, Diane},
  title     = {A Corpus of Annotated Revisions for Studying Argumentative Writing},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1568--1578},
  abstract  = {This paper presents ArgRewrite, a corpus of between-draft revisions of
	argumentative essays. Drafts are manually aligned at the sentence level, and
	the writer's purpose for each revision is annotated with categories analogous
	to those used in argument mining and discourse analysis. The corpus should
	enable advanced research in writing comparison and revision analysis, as
	demonstrated via our own studies of student revision behavior and of automatic
	revision purpose prediction.},
  url       = {http://aclweb.org/anthology/P17-1144}
}

@InProceedings{ustalov-panchenko-biemann:2017:Long,
  author    = {Ustalov, Dmitry  and  Panchenko, Alexander  and  Biemann, Chris},
  title     = {Watset: Automatic Induction of Synsets from a Graph of Synonyms},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1579--1590},
  abstract  = {This paper presents a new graph-based approach that induces synsets using
	synonymy dictionaries and word embeddings. First, we build a weighted graph of
	synonyms extracted from commonly available resources, such as Wiktionary.
	Second, we apply word sense induction to deal with ambiguous words. Finally, we
	cluster the disambiguated version of the ambiguous input graph into synsets.
	Our meta-clustering approach lets us use an efficient hard clustering algorithm
	to perform a fuzzy clustering of the graph. Despite its simplicity, our
	approach shows excellent results, outperforming five competitive
	state-of-the-art methods in terms of F-score on three gold standard datasets
	for English and Russian derived from large-scale manually constructed lexical
	resources.},
  url       = {http://aclweb.org/anthology/P17-1145}
}

@InProceedings{ouchi-shindo-matsumoto:2017:Long,
  author    = {Ouchi, Hiroki  and  Shindo, Hiroyuki  and  Matsumoto, Yuji},
  title     = {Neural Modeling of Multi-Predicate Interactions for Japanese Predicate Argument Structure Analysis},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1591--1600},
  abstract  = {The performance of Japanese predicate argument structure (PAS) analysis has
	improved in recent years thanks to the joint modeling of interactions between
	multiple predicates. However, this approach relies heavily on syntactic
	information predicted by parsers, and suffers from error propagation. To remedy
	this problem, we
	introduce a model that uses grid-type recurrent neural networks. The proposed
	model automatically induces features sensitive to multi-predicate interactions
	from
	the word sequence information of a sentence. Experiments on the NAIST Text
	Corpus demonstrate that without syntactic information, our model outperforms
	previous syntax-dependent models.},
  url       = {http://aclweb.org/anthology/P17-1146}
}

@InProceedings{joshi-EtAl:2017:Long,
  author    = {Joshi, Mandar  and  Choi, Eunsol  and  Weld, Daniel  and  Zettlemoyer, Luke},
  title     = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1601--1611},
  abstract  = {We present TriviaQA, a challenging reading comprehension dataset containing
	over 650K question-answer-evidence triples. TriviaQA includes 95K
	question-answer  pairs authored by trivia enthusiasts and independently
	gathered evidence documents, six per question on average, that provide high
	quality distant supervision for answering the questions. We show that, in
	comparison to other recently introduced large-scale datasets, TriviaQA (1) has
	relatively complex, compositional questions,  (2)  has considerable 
	syntactic and  lexical                                      variability  between     
	     
	questions and 
	corresponding 
	answer-evidence  sentences,  and  (3) requires more cross sentence reasoning to
	find answers.  We also present two baseline algorithms: a feature-based
	classifier and a state-of-the-art neural network, that performs well on SQuAD
	reading comprehension. Neither approach comes close to human performance (23%
	and 40% vs. 80%), suggesting that TriviaQA is a challenging testbed that is
	worth significant future study.},
  url       = {http://aclweb.org/anthology/P17-1147}
}

@InProceedings{richardson-kuhn:2017:Long,
  author    = {Richardson, Kyle  and  Kuhn, Jonas},
  title     = {Learning Semantic Correspondences in Technical Documentation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1612--1622},
  abstract  = {We consider the problem of translating high-level textual descriptions to
	formal representations in technical documentation as part of an effort to model
	the meaning of such documentation.  We focus specifically on the problem of
	learning translational correspondences between text descriptions and grounded
	representations in the target documentation, such as formal representation of
	functions or code templates.  Our approach exploits the parallel nature of such
	documentation, or the tight coupling between high-level text and the low-level
	representations we aim to learn. Data is collected by mining technical
	documents for such parallel text-representation pairs, which we use to train a
	simple semantic parsing model. We report new baseline results on sixteen novel
	datasets, including the standard library documentation for nine popular
	programming languages across seven natural languages, and a small collection of
	Unix utility manuals.},
  url       = {http://aclweb.org/anthology/P17-1148}
}

@InProceedings{cao-EtAl:2017:Long1,
  author    = {Cao, Yixin  and  Huang, Lifu  and  Ji, Heng  and  Chen, Xu  and  Li, Juanzi},
  title     = {Bridge Text and Knowledge by Learning Multi-Prototype Entity Mention Embedding},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1623--1633},
  abstract  = {Integrating text and knowledge into a unified semantic space has attracted
	significant research interests recently. However, the ambiguity in the common
	space remains a challenge, namely that the same mention phrase usually refers
	to various entities. In this paper, to deal with the ambiguity of entity
	mentions, we propose a novel Multi-Prototype Mention Embedding model, which
	learns multiple sense embeddings for each mention by jointly modeling words
	from textual contexts and entities derived from a knowledge base. In addition,
	we further design an efficient language model based approach to disambiguate
	each mention to a specific sense. In experiments, both qualitative and
	quantitative analysis demonstrate the high quality of the word, entity and
	multi-prototype mention embeddings. Using entity linking as a study case, we
	apply our disambiguation method as well as the multi-prototype mention
	embeddings on the benchmark dataset, and achieve state-of-the-art performance.},
  url       = {http://aclweb.org/anthology/P17-1149}
}

@InProceedings{she-chai:2017:Long,
  author    = {She, Lanbo  and  Chai, Joyce},
  title     = {Interactive Learning of Grounded Verb Semantics towards Human-Robot Communication},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1634--1644},
  abstract  = {To enable human-robot communication and collaboration, previous works represent
	grounded verb semantics as the potential change of state to the physical world
	caused by these verbs. Grounded verb semantics are acquired mainly based on the
	parallel data of the use of a verb phrase and its corresponding sequences of
	primitive actions demonstrated by humans. The
	rich interaction between teachers and students that is considered important in
	learning new skills has not yet been explored. To address this limitation, this
	paper presents a new interactive learning approach that allows robots to
	proactively engage in interaction with human partners by asking good questions
	to learn models for grounded verb semantics. The proposed approach uses
	reinforcement learning to allow the robot to acquire an optimal policy for its
	question-asking behaviors by maximizing the long-term reward. Our empirical
	results have shown that the interactive learning approach leads to more
	reliable models for grounded verb semantics, especially in the noisy
	environment which is full of uncertainties. Compared to previous work, the
	models acquired from interactive learning result in a 48% to 145% performance
	gain when applied in new situations.},
  url       = {http://aclweb.org/anthology/P17-1150}
}

@InProceedings{athiwaratkun-wilson:2017:Long,
  author    = {Athiwaratkun, Ben  and  Wilson, Andrew},
  title     = {Multimodal Word Distributions},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1645--1656},
  abstract  = {Word embeddings provide point representations of words containing useful
	semantic information. 
	We introduce multimodal word distributions formed from Gaussian mixtures, for
	multiple word meanings, entailment, and rich uncertainty information.  To learn
	these distributions, we propose an energy-based max-margin objective. We show
	that the resulting approach captures uniquely  expressive semantic information,
	and outperforms alternatives, such as word2vec skip-grams, and Gaussian
	embeddings, on benchmark datasets such as word similarity and entailment.},
  url       = {http://aclweb.org/anthology/P17-1151}
}

@InProceedings{chen-EtAl:2017:Long3,
  author    = {Chen, Qian  and  Zhu, Xiaodan  and  Ling, Zhen-Hua  and  Wei, Si  and  Jiang, Hui  and  Inkpen, Diana},
  title     = {Enhanced LSTM for Natural Language Inference},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1657--1668},
  abstract  = {Reasoning and inference are central to human and artificial intelligence.
	Modeling inference in human language is very challenging. With the availability
	of large annotated data (Bowman et al., 2015), it has recently become feasible
	to train neural network based inference models, which have shown to be very
	effective. In this paper, we present a new state-of-the-art result, achieving
	the accuracy of 88.6% on the Stanford Natural Language Inference Dataset.
	Unlike the previous top models that use very complicated network architectures,
	we first demonstrate that carefully designing sequential inference models based
	on chain LSTMs can outperform all previous models. Based on this, we further
	show that by explicitly considering recursive architectures in both local
	inference modeling and inference composition, we achieve additional
	improvement. Particularly, incorporating syntactic parsing information
	contributes to our best result---it further improves the performance even when
	added to the already very strong model.},
  url       = {http://aclweb.org/anthology/P17-1152}
}

@InProceedings{ramakrishna-EtAl:2017:Long,
  author    = {Ramakrishna, Anil  and  Mart{\'\i}nez, Victor R.  and  Malandrakis, Nikolaos  and  Singla, Karan  and  Narayanan, Shrikanth},
  title     = {Linguistic analysis of differences in portrayal of movie characters},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1669--1678},
  abstract  = {We examine differences in portrayal of characters in movies using
	psycholinguistic and graph theoretic measures computed directly from
	screenplays. Differences are examined with respect to characters' gender, race,
	age and other metadata. Psycholinguistic metrics are extrapolated to dialogues
	in movies using a linear regression model built on a set of manually annotated
	seed words. Interesting patterns are revealed about relationships between
	genders of production team and the gender ratio of characters. Several
	correlations are noted between gender, race, age of characters and the
	linguistic metrics.},
  url       = {http://aclweb.org/anthology/P17-1153}
}

@InProceedings{qian-EtAl:2017:Long,
  author    = {Qian, Qiao  and  Huang, Minlie  and  Lei, Jinhao  and  Zhu, Xiaoyan},
  title     = {Linguistically Regularized LSTM for Sentiment Classification},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1679--1689},
  abstract  = {This paper deals with sentence-level sentiment classification. Though a variety
	of neural network models have been proposed recently, however, previous models
	either depend on expensive phrase-level annotation, most of which has
	remarkably degraded performance when trained with only sentence-level
	annotation; or do not fully employ linguistic resources (e.g., sentiment
	lexicons, negation words, intensity words). In this paper, we propose simple
	models trained with sentence-level annotation, but also attempt to model the
	linguistic role of sentiment lexicons, negation words, and intensity words.
	Results show that our models are able to capture the linguistic role of
	sentiment words, negation words, and intensity words in sentiment expression.},
  url       = {http://aclweb.org/anthology/P17-1154}
}

@InProceedings{peled-reichart:2017:Long,
  author    = {Peled, Lotem  and  Reichart, Roi},
  title     = {Sarcasm SIGN: Interpreting Sarcasm with Sentiment Based Monolingual Machine Translation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1690--1700},
  abstract  = {Sarcasm is a form of speech in which speakers say the opposite of what they
	truly mean in order to convey a strong sentiment. In other words, ``Sarcasm is
	the giant chasm between what I say, and the person who doesn't get it.''. In
	this paper we present the novel task of sarcasm interpretation, defined as the
	generation of a non-sarcastic utterance conveying the same message as the
	original sarcastic one. We introduce a novel dataset of 3000 sarcastic tweets,
	each interpreted by five human judges. Addressing the task as monolingual
	machine translation (MT), we experiment with MT algorithms and evaluation
	measures. We then present SIGN:
	an MT based sarcasm interpretation algorithm that targets sentiment words, a
	defining element of textual sarcasm. We show that while the scores of n-gram
	based automatic measures are similar for all interpretation models, SIGN's
	interpretations are scored higher by humans for adequacy and sentiment
	polarity. We conclude with a discussion on future research directions for our
	new task.},
  url       = {http://aclweb.org/anthology/P17-1155}
}

@InProceedings{wu-huang-yan:2017:Long,
  author    = {Wu, Fangzhao  and  Huang, Yongfeng  and  Yan, Jun},
  title     = {Active Sentiment Domain Adaptation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1701--1711},
  abstract  = {Domain adaptation is an important technology to handle domain dependence
	problem in sentiment analysis field. Existing methods usually rely on sentiment
	classifiers trained in source domains. However, their performance may heavily
	decline if the distributions of sentiment features in source and target domains
	have significant difference. In this paper, we propose an active sentiment
	domain adaptation approach to handle this problem. Instead of the source domain
	sentiment classifiers, our approach adapts the general-purpose sentiment
	lexicons to target domain with the help of a small number of labeled samples
	which are selected and annotated in an active learning mode, as well as the
	domain-specific sentiment similarities among words mined from unlabeled samples
	of target domain. A unified model is proposed to fuse different types of
	sentiment information and train sentiment classifier for target domain.
	Extensive experiments on benchmark datasets show that our approach can train
	accurate sentiment classifier with less labeled samples.},
  url       = {http://aclweb.org/anthology/P17-1156}
}

@InProceedings{rekabsaz-EtAl:2017:Long,
  author    = {Rekabsaz, Navid  and  Lupu, Mihai  and  Baklanov, Artem  and  D\"{u}r, Alexander  and  Andersson, Linda  and  Hanbury, Allan},
  title     = {Volatility Prediction using Financial Disclosures Sentiments with Word Embedding-based IR Models},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1712--1721},
  abstract  = {Volatility prediction---an essential concept in financial markets---has
	recently been addressed using sentiment analysis methods. We investigate the
	sentiment of annual disclosures of companies in stock markets to forecast
	volatility. We specifically explore the use of recent Information Retrieval
	(IR) term weighting models that are effectively extended by related terms using
	word embeddings. In parallel to textual information, factual market data have
	been widely used as the mainstream approach to forecast market risk. We
	therefore study different fusion methods to combine text and market data
	resources. Our word embedding-based approach significantly outperforms
	state-of-the-art methods. In addition, we investigate the characteristics of
	the reports of the companies in different financial sectors.},
  url       = {http://aclweb.org/anthology/P17-1157}
}

@InProceedings{tu-EtAl:2017:Long,
  author    = {Tu, Cunchao  and  Liu, Han  and  Liu, Zhiyuan  and  Sun, Maosong},
  title     = {CANE: Context-Aware Network Embedding for Relation Modeling},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1722--1731},
  abstract  = {Network embedding (NE) is playing a critical role in network analysis, due to
	its ability to represent vertices with efficient low-dimensional embedding
	vectors. However, existing NE models aim to learn a fixed context-free
	embedding for each vertex and neglect the diverse roles when interacting with
	other vertices. In this paper, we assume that one vertex usually shows
	different aspects when interacting with different neighbor vertices, and should
	own different embeddings respectively. Therefore, we present Context-Aware
	Network Embedding (CANE), a novel NE model to address this issue. CANE learns
	context-aware embeddings for vertices with mutual attention mechanism and is
	expected to model the semantic relationships between vertices more precisely.
	In experiments, we compare our model with existing NE models on three
	real-world datasets. Experimental results show that CANE achieves significant
	improvement than state-of-the-art methods on link prediction and comparable
	performance on vertex classification. The source code and datasets can be
	obtained from https://github.com/thunlp/CANE},
  url       = {http://aclweb.org/anthology/P17-1158}
}

@InProceedings{wang-EtAl:2017:Long6,
  author    = {Wang, Hongmin  and  Zhang, Yue  and  Chan, GuangYong Leonard  and  Yang, Jie  and  Chieu, Hai Leong},
  title     = {Universal Dependencies Parsing for Colloquial Singaporean English},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1732--1744},
  abstract  = {Singlish can be interesting to the ACL community both linguistically as a major
	creole based on English, and computationally for information extraction and
	sentiment analysis of regional social media. We investigate dependency parsing
	of Singlish by constructing a dependency treebank under the Universal
	Dependencies scheme, and then training a neural network model by integrating
	English syntactic knowledge into a state-of-the-art parser trained on the
	Singlish treebank. Results show that English knowledge can lead to 25\% relative
	error reduction, resulting in a parser of 84.47\% accuracies. To the best of our
	knowledge, we are the first to use neural stacking to improve cross-lingual
	dependency parsing on low-resource languages. We make both our annotation and
	parser available for further research.},
  url       = {http://aclweb.org/anthology/P17-1159}
}

@InProceedings{ylijyra-gomezrodriguez:2017:Long,
  author    = {Yli-Jyr\"{a}, Anssi  and  G\'{o}mez-Rodr\'{i}guez, Carlos},
  title     = {Generic Axiomatization of Families of Noncrossing Graphs in Dependency Parsing},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1745--1755},
  abstract  = {We present a simple encoding for unlabeled noncrossing graphs and show how its
	latent counterpart helps us to represent several families of directed and
	undirected graphs used in syntactic and semantic parsing of natural language as
	context-free languages.  The families are separated purely on the basis of
	forbidden patterns in latent encoding, eliminating the need to differentiate
	the families of non-crossing graphs in inference algorithms: one algorithm
	works for all when the search space can be controlled in parser input.},
  url       = {http://aclweb.org/anthology/P17-1160}
}

@InProceedings{peters-EtAl:2017:Long,
  author    = {Peters, Matthew  and  Ammar, Waleed  and  Bhagavatula, Chandra  and  Power, Russell},
  title     = {Semi-supervised sequence tagging with bidirectional language models},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1756--1765},
  abstract  = {Pre-trained word embeddings learned from unlabeled text have become a
	standard component of neural network architectures for NLP tasks. However, in
	most cases, the recurrent network that operates on word-level representations
	to produce context sensitive representations is trained on relatively little
	labeled data. In this paper, we demonstrate a general semi-supervised approach
	for adding pre-trained context embeddings from bidirectional language models
	to NLP systems and apply it to sequence labeling tasks. We evaluate our model
	on two standard datasets for named entity recognition (NER) and chunking, and
	in both cases achieve state of the art results, surpassing previous systems
	that use other forms of transfer or joint learning with additional labeled data
	and task specific gazetteers.},
  url       = {http://aclweb.org/anthology/P17-1161}
}

@InProceedings{he-EtAl:2017:Long4,
  author    = {He, He  and  Balakrishnan, Anusha  and  Eric, Mihail  and  Liang, Percy},
  title     = {Learning Symmetric Collaborative Dialogue Agents with Dynamic Knowledge Graph Embeddings},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1766--1776},
  abstract  = {We study a symmetric collaborative dialogue setting
	in which two agents, each with private knowledge,
	must strategically communicate to achieve a common goal.
	The open-ended dialogue state in this setting poses new challenges for existing
	dialogue systems.
	We collected a dataset of 11K human-human dialogues,
	which exhibits interesting lexical, semantic, and strategic elements.
	To model
	both structured knowledge and unstructured language,
	we propose a neural model with dynamic knowledge graph embeddings
	that evolve as the dialogue progresses.
	Automatic and human evaluations show that our model is both more effective
	at achieving the goal and more human-like than baseline neural and rule-based
	models.},
  url       = {http://aclweb.org/anthology/P17-1162}
}

@InProceedings{mrkvsic-EtAl:2017:Long,
  author    = {Mrk\v{s}i\'{c}, Nikola  and  \'{O} S\'{e}aghdha, Diarmuid  and  Wen, Tsung-Hsien  and  Thomson, Blaise  and  Young, Steve},
  title     = {Neural Belief Tracker: Data-Driven Dialogue State Tracking},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1777--1788},
  abstract  = {One of the core components of modern spoken dialogue systems is the belief
	tracker, which estimates the user's goal at every step of the dialogue.
	However, most current approaches have difficulty scaling to larger, more
	complex dialogue domains. This is due to their dependency on either: a) Spoken
	Language Understanding models that require large amounts of annotated training
	data; or b) hand-crafted lexicons for capturing some of the linguistic
	variation in users' language. We propose a novel Neural Belief Tracking (NBT)
	framework which overcomes these problems by building on recent advances in
	representation learning. NBT models reason over pre-trained word vectors,
	learning to compose them into distributed representations of user utterances
	and dialogue context. Our evaluation on two datasets shows that this approach
	surpasses past limitations, matching the performance of state-of-the-art models
	which rely on hand-crafted semantic lexicons and outperforming them when such
	lexicons are not provided.},
  url       = {http://aclweb.org/anthology/P17-1163}
}

@InProceedings{liu-EtAl:2017:Long2,
  author    = {Liu, Shulin  and  Chen, Yubo  and  Liu, Kang  and  Zhao, Jun},
  title     = {Exploiting Argument Information to Improve Event Detection via Supervised Attention Mechanisms},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1789--1798},
  abstract  = {This paper tackles the task of event detection (ED), which involves identifying
	and categorizing events. We argue that arguments provide significant clues to
	this task, but they are either completely ignored or exploited in an indirect
	manner in existing detection approaches. In this work, we propose to exploit
	argument information explicitly for ED via supervised attention mechanisms. In
	specific, we systematically investigate the proposed model under the
	supervision of different attention strategies. Experimental results show that
	our approach advances state-of-the-arts and achieves the best F1 score on
	ACE 2005 dataset.},
  url       = {http://aclweb.org/anthology/P17-1164}
}

@InProceedings{amoualian-EtAl:2017:Long,
  author    = {Amoualian, Hesam  and  Lu, Wei  and  Gaussier, Eric  and  Balikas, Georgios  and  Amini, Massih R.  and  Clausel, Marianne},
  title     = {Topical Coherence in LDA-based Models through Induced Segmentation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1799--1809},
  abstract  = {This paper presents an LDA-based model that generates topically coherent
	segments within documents by jointly segmenting documents and assigning topics
	to their words. The coherence between topics is ensured through a copula,
	binding the topics associated to the words of a segment. In addition, this
	model relies on both document and segment specific topic distributions so as to
	capture fine grained differences in topic assignments. We show that the
	proposed model naturally encompasses other state-of-the-art LDA-based models
	designed for similar tasks. Furthermore, our experiments, conducted on six
	different publicly available datasets, show the effectiveness of our model in
	terms of perplexity, Normalized Pointwise Mutual Information, which captures
	the coherence between the generated topics, and the Micro F1 measure for text
	classification.},
  url       = {http://aclweb.org/anthology/P17-1165}
}

@InProceedings{ye-EtAl:2017:Long1,
  author    = {Ye, Hai  and  Chao, Wenhan  and  Luo, Zhunchen  and  Li, Zhoujun},
  title     = {Jointly Extracting Relations with Class Ties via Effective Deep Ranking},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1810--1820},
  abstract  = {Connections between relations in relation extraction, which we call class ties,
	are common. In distantly supervised scenario, one entity tuple may have
	multiple relation facts. Exploiting class ties between relations of one entity
	tuple will be promising for distantly supervised relation extraction. However,
	previous models are not effective or ignore to model this property. In this
	work, to effectively leverage class ties, we propose to make joint relation
	extraction with a unified model that integrates convolutional neural network
	(CNN) with a general pairwise ranking framework, in which three novel ranking
	loss functions are introduced. Additionally, an effective method is presented
	to relieve the severe class imbalance problem from NR (not relation) for model
	training. Experiments on a widely used dataset show that leveraging class ties
	will enhance extraction and demonstrate the effectiveness of our model to learn
	class ties. Our model outperforms the baselines significantly, achieving
	state-of-the-art performance.},
  url       = {http://aclweb.org/anthology/P17-1166}
}

@InProceedings{iyyer-yih-chang:2017:Long,
  author    = {Iyyer, Mohit  and  Yih, Wen-tau  and  Chang, Ming-Wei},
  title     = {Search-based Neural Structured Learning for Sequential Question Answering},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1821--1831},
  abstract  = {Recent work in semantic parsing for question answering has focused on long and
	complicated questions, many of which would seem unnatural if asked in a normal
	conversation between two humans. In an effort to explore a conversational QA
	setting, we present a more realistic task: answering sequences of simple but
	inter-related questions. We collect a dataset of 6,066 question sequences that
	inquire about semi-structured tables from Wikipedia, with 17,553
	question-answer pairs in total. To solve this sequential question answering
	task, we propose a novel dynamic neural semantic parsing framework trained
	using a weakly supervised reward-guided search. Our model effectively leverages
	the sequential context to outperform state-of-the-art QA systems that are
	designed to answer highly complex questions.},
  url       = {http://aclweb.org/anthology/P17-1167}
}

@InProceedings{dhingra-EtAl:2017:Long2,
  author    = {Dhingra, Bhuwan  and  Liu, Hanxiao  and  Yang, Zhilin  and  Cohen, William  and  Salakhutdinov, Ruslan},
  title     = {Gated-Attention Readers for Text Comprehension},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1832--1846},
  abstract  = {In this paper we study the problem of answering cloze-style questions over
	documents. Our model, the Gated-Attention (GA) Reader, integrates a multi-hop
	architecture with a novel attention mechanism, which is based on multiplicative
	interactions between the query embedding and the intermediate states of a
	recurrent neural network document reader. This enables the reader to build
	query-specific representations of tokens in the document for accurate answer
	selection. The GA Reader obtains state-of-the-art results on three benchmarks
	for this task--the CNN \& Daily Mail news stories and the Who Did What dataset.
	The effectiveness of multiplicative interaction is demonstrated by an ablation
	study, and by comparing to alternative compositional operators for implementing
	the gated-attention.},
  url       = {http://aclweb.org/anthology/P17-1168}
}

@InProceedings{ye-EtAl:2017:Long2,
  author    = {Ye, Jianbo  and  Li, Yanran  and  Wu, Zhaohui  and  Wang, James Z.  and  Li, Wenjie  and  Li, Jia},
  title     = {Determining Gains Acquired from Word Embedding Quantitatively Using Discrete Distribution Clustering},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1847--1856},
  abstract  = {Word embeddings have become widely-used in document analysis. While a large
	number of models for mapping words to vector spaces have been developed, it
	remains undetermined how much net gain can be achieved over traditional
	approaches based on bag-of-words. In this paper, we propose a new document
	clustering approach by combining any word embedding with a state-of-the-art
	algorithm for clustering empirical distributions. By using the Wasserstein
	distance between distributions, the word-to-word semantic relationship is taken
	into account in a principled way. The new clustering method is easy to use and
	consistently outperforms other methods on a variety of data sets. More
	importantly, the method provides an effective framework for determining when
	and how much word embeddings contribute to document analysis. Experimental
	results with multiple embedding models are reported.},
  url       = {http://aclweb.org/anthology/P17-1169}
}

@InProceedings{pilehvar-EtAl:2017:Long,
  author    = {Pilehvar, Mohammad Taher  and  Camacho-Collados, Jose  and  Navigli, Roberto  and  Collier, Nigel},
  title     = {Towards a Seamless Integration of Word Senses into Downstream NLP Applications},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1857--1869},
  abstract  = {Lexical ambiguity can impede NLP systems from accurate understanding of
	semantics. Despite its potential benefits, the integration of sense-level
	information into NLP systems has remained understudied. By incorporating a
	novel disambiguation algorithm into a state-of-the-art classification model, we
	create a pipeline to integrate sense-level information into downstream NLP
	applications. We show that a simple disambiguation of the input text can lead
	to consistent performance improvement on multiple topic categorization and
	polarity detection datasets, particularly when the fine granularity of the
	underlying sense inventory is reduced and the document is sufficiently large.
	Our results also point to the need for sense representation research to focus
	more on in vivo evaluations which target the performance in downstream NLP
	applications rather than artificial benchmarks.},
  url       = {http://aclweb.org/anthology/P17-1170}
}

@InProceedings{chen-EtAl:2017:Long4,
  author    = {Chen, Danqi  and  Fisch, Adam  and  Weston, Jason  and  Bordes, Antoine},
  title     = {Reading Wikipedia to Answer Open-Domain Questions},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1870--1879},
  abstract  = {This paper proposes to tackle open-domain question answering using Wikipedia as
	the unique knowledge source: the answer to any factoid question is a text span
	in a Wikipedia article. This task of machine reading at scale combines the
	challenges of document retrieval (finding the relevant articles) with that of
	machine comprehension of text (identifying the answer spans from those
	articles). Our approach combines a search component based on bigram hashing and
	TF-IDF matching with a multi-layer recurrent neural network model trained to
	detect answers in Wikipedia paragraphs. Our experiments on multiple existing QA
	datasets indicate that (1) both modules are highly competitive with respect to
	existing counterparts and (2) multitask learning using distant supervision on
	their combination is an effective complete system on this challenging task.},
  url       = {http://aclweb.org/anthology/P17-1171}
}

@InProceedings{yu-lee-le:2017:Long,
  author    = {Yu, Adams Wei  and  Lee, Hongrae  and  Le, Quoc},
  title     = {Learning to Skim Text},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1880--1890},
  abstract  = {Recurrent Neural Networks are showing much promise in many sub-areas of natural
	language processing, ranging from document classification to machine
	translation to automatic question answering. Despite their promise, many
	recurrent models have to read the whole text word by word, making it slow to
	handle long documents. For example, it is difficult to use a recurrent network
	to read a book and answer questions about it. In this paper, we present an
	approach of reading text while skipping irrelevant information if needed. The
	underlying model is a recurrent network that learns how far to jump after
	reading a few words of the input text. We employ a standard policy gradient
	method to train the model to make discrete jumping decisions. In our benchmarks
	on four different tasks, including number prediction, sentiment analysis, news
	article classification and automatic Q\&A, our proposed model, a modified LSTM
	with jumping, is up to 6 times faster than the standard sequential LSTM, while
	maintaining the same or even better accuracy.},
  url       = {http://aclweb.org/anthology/P17-1172}
}

@InProceedings{srikumar:2017:Long,
  author    = {Srikumar, Vivek},
  title     = {An Algebra for Feature Extraction},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1891--1900},
  abstract  = {Though feature extraction is a necessary first step in statistical NLP, it is
	often seen as a mere preprocessing step. Yet, it can dominate computation time,
	both during training, and especially at deployment. In this paper, we formalize
	feature extraction from an algebraic perspective. Our formalization allows us
	to define a message passing algorithm that can restructure feature templates to
	be more computationally efficient. We show via experiments on text chunking and
	relation extraction that this restructuring does indeed speed up feature
	extraction in practice by reducing redundant computation.},
  url       = {http://aclweb.org/anthology/P17-1173}
}

@InProceedings{ishiwatari-EtAl:2017:Long,
  author    = {Ishiwatari, Shonosuke  and  Yao, Jingtao  and  Liu, Shujie  and  Li, Mu  and  Zhou, Ming  and  Yoshinaga, Naoki  and  Kitsuregawa, Masaru  and  Jia, Weijia},
  title     = {Chunk-based Decoder for Neural Machine Translation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1901--1912},
  abstract  = {Chunks (or phrases) once played a pivotal role in machine translation. By using
	a chunk rather than a word as the basic translation unit, local (intra-chunk)
	and global (inter-chunk) word orders and dependencies can be easily modeled.
	The chunk structure, despite its importance, has not been considered in the
	decoders used for neural machine translation (NMT). In this paper, we propose
	chunk-based decoders for (NMT), each of which consists of a chunk-level decoder
	and a word-level decoder. The chunk-level decoder models global dependencies
	while the word-level decoder decides the local word order in a chunk. To output
	a target sentence, the chunk-level decoder generates a chunk representation
	containing global information, which the word-level decoder then uses as a
	basis to predict the words inside the chunk. Experimental results show that our
	proposed decoders can significantly improve translation performance in a WAT
	'16 English-to-Japanese translation task.},
  url       = {http://aclweb.org/anthology/P17-1174}
}

@InProceedings{calixto-liu-campbell:2017:Long,
  author    = {Calixto, Iacer  and  Liu, Qun  and  Campbell, Nick},
  title     = {Doubly-Attentive Decoder for Multi-modal Neural Machine Translation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1913--1924},
  abstract  = {We introduce a Multi-modal Neural Machine Translation model in which a
	doubly-attentive decoder naturally incorporates spatial visual features
	obtained using pre-trained convolutional neural networks, bridging the gap
	between image description and translation. Our decoder learns to attend to
	source-language words and parts of an image independently by means of two
	separate attention mechanisms as it generates words in the target language. We
	find that our model can efficiently exploit not just back-translated in-domain
	multi-modal data but also large general-domain text-only MT corpora. We also
	report state-of-the-art results on the Multi30k data set.},
  url       = {http://aclweb.org/anthology/P17-1175}
}

@InProceedings{chen-EtAl:2017:Long5,
  author    = {Chen, Yun  and  Liu, Yang  and  Cheng, Yong  and  Li, Victor O.K.},
  title     = {A Teacher-Student Framework for Zero-Resource Neural Machine Translation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1925--1935},
  abstract  = {While end-to-end neural machine translation (NMT) has made remarkable progress
	recently, it still suffers from the data scarcity problem for low-resource
	language pairs and domains. In this paper, we propose a method for
	zero-resource NMT by assuming that parallel sentences have close probabilities
	of generating a sentence in a third language. Based on the assumption, our
	method is able to train a source-to-target NMT model (``student'') without
	parallel corpora available guided by an existing pivot-to-target NMT model
	(``teacher'') on a source-pivot parallel corpus. Experimental results show that
	the proposed method significantly improves over a baseline pivot-based model by
	+3.0 BLEU points across various language pairs.},
  url       = {http://aclweb.org/anthology/P17-1176}
}

@InProceedings{chen-EtAl:2017:Long6,
  author    = {Chen, Huadong  and  Huang, Shujian  and  Chiang, David  and  Chen, Jiajun},
  title     = {Improved Neural Machine Translation with a Syntax-Aware Encoder and Decoder},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1936--1945},
  abstract  = {Most neural machine translation (NMT) models are based on the sequential
	encoder-decoder framework, which makes no use of syntactic information. In this
	paper, we improve this model by explicitly incorporating source-side syntactic
	trees. More specifically, we propose (1) a bidirectional tree
	encoder which learns both sequential and tree structured representations; (2) a
	tree-coverage model that lets the attention depend on the source-side syntax.
	Experiments on Chinese-English translation demonstrate that our proposed models
	outperform the sequential attentional model as well as a stronger baseline with
	a bottom-up tree encoder and word coverage.},
  url       = {http://aclweb.org/anthology/P17-1177}
}

% P17-1178 (ACL 2017 Long Papers, pp. 1946--1958): Pan et al., cross-lingual
% name tagging/linking for 282 Wikipedia languages.
% Fix: abstract typo "on-Wikipedia data" -> "non-Wikipedia data" (the sentence
% contrasts Wikipedia data with data from outside Wikipedia).
@InProceedings{pan-EtAl:2017:Long2,
  author    = {Pan, Xiaoman  and  Zhang, Boliang  and  May, Jonathan  and  Nothman, Joel  and  Knight, Kevin  and  Ji, Heng},
  title     = {Cross-lingual Name Tagging and Linking for 282 Languages},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1946--1958},
  abstract  = {The ambitious goal of this work is to develop a cross-lingual name tagging and
	linking framework for 282 languages that exist in Wikipedia. Given a document
	in any of these languages, our framework is able to identify name mentions,
	assign a coarse-grained or fine-grained type to each mention, and link it to an
	English Knowledge Base (KB) if it is linkable. We achieve this goal by
	performing a series of new KB mining methods: generating ``silver-standard''
	annotations by transferring annotations from English to other languages through
	cross-lingual links and KB properties, refining annotations through
	self-training and topic selection, deriving language-specific morphology
	features from anchor links, and mining word translation pairs from
	cross-lingual links. Both name tagging and linking results for 282 languages
	are promising on Wikipedia data and non-Wikipedia data.},
  url       = {http://aclweb.org/anthology/P17-1178}
}

% P17-1179 (ACL 2017 Long Papers, pp. 1959--1970): Zhang, Liu, Luan & Sun,
% adversarial training for unsupervised bilingual lexicon induction.
% Entry checked: pages range and anthology URL are consistent.
@InProceedings{zhang-EtAl:2017:Long5,
  author    = {Zhang, Meng  and  Liu, Yang  and  Luan, Huanbo  and  Sun, Maosong},
  title     = {Adversarial Training for Unsupervised Bilingual Lexicon Induction},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1959--1970},
  abstract  = {Word embeddings are well known to capture linguistic regularities of the
	language on which they are trained. Researchers also observe that these
	regularities can transfer across languages. However, previous endeavors to
	connect separate monolingual word embeddings typically require cross-lingual
	signals as supervision, either in the form of parallel corpus or seed lexicon.
	In this work, we show that such cross-lingual connection can actually be
	established without any form of supervision. We achieve this end by formulating
	the problem as a natural adversarial game, and investigating techniques that
	are crucial to successful training. We carry out evaluation on the unsupervised
	bilingual lexicon induction task. Even though this task appears intrinsically
	cross-lingual, we are able to demonstrate encouraging performance without any
	cross-lingual clues.},
  url       = {http://aclweb.org/anthology/P17-1179}
}

% P17-1180 (ACL 2017 Long Papers, pp. 1971--1982): Rijhwani et al.,
% word-level language detection for code-switched tweets.
% Fix: escape the percent sign ("74\%") -- an unescaped % starts a LaTeX
% comment when the abstract is typeset, silently truncating the line.
@InProceedings{rijhwani-EtAl:2017:Long,
  author    = {Rijhwani, Shruti  and  Sequiera, Royal  and  Choudhury, Monojit  and  Bali, Kalika  and  Maddila, Chandra Shekhar},
  title     = {Estimating Code-Switching on Twitter with a Novel Generalized Word-Level Language Detection Technique},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1971--1982},
  abstract  = {Word-level language detection is necessary for analyzing code-switched text,
	where multiple languages could be mixed within a sentence. Existing models are
	restricted to code-switching between two specific languages and fail in
	real-world scenarios as text input rarely has a priori information on the
	languages used. We present a novel unsupervised word-level language detection
	technique for code-switched text for an arbitrarily large number of languages,
	which does not require any manually annotated training data. Our experiments
	with tweets in seven languages show a 74\% relative error reduction in
	word-level labeling with respect to competitive baselines. We then use this
	system to conduct a large-scale quantitative analysis of code-switching
	patterns on Twitter, both global as well as region-specific, with 58M tweets.},
  url       = {http://aclweb.org/anthology/P17-1180}
}

% P17-1181 (ACL 2017 Long Papers, pp. 1983--1992): Bloodgood & Strauss,
% global constraints and reranking for cognates detection.
% Entry checked: pages range and anthology URL are consistent.
@InProceedings{bloodgood-strauss:2017:Long,
  author    = {Bloodgood, Michael  and  Strauss, Benjamin},
  title     = {Using Global Constraints and Reranking to Improve Cognates Detection},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1983--1992},
  abstract  = {Global constraints and reranking have not been used in cognates detection
	research to date. We propose methods for using global constraints by performing
	rescoring of the score matrices produced by state of the art cognates detection
	systems. Using global constraints to perform rescoring is complementary to
	state of the art methods for performing cognates detection and results in
	significant performance improvements beyond current state of the art
	performance on publicly available datasets with different language pairs and
	various conditions such as different levels of baseline state of the art
	performance and different data size conditions, including with more realistic
	large data size conditions than have been evaluated with in the past.},
  url       = {http://aclweb.org/anthology/P17-1181}
}

% P17-1182 (ACL 2017 Long Papers, pp. 1993--2003): Kann, Cotterell & Schuetze,
% cross-lingual transfer for paradigm completion.
% Fixes: (1) Sch\"{u}tze -> Sch{\"u}tze, the BibTeX "special character" form
% ({\"u} at brace level 1), so sorting and label generation treat the umlaut
% as one letter; (2) escape the percent sign ("58\%") so LaTeX does not treat
% the rest of the line as a comment when typesetting the abstract.
@InProceedings{kann-cotterell-schutze:2017:Long,
  author    = {Kann, Katharina  and  Cotterell, Ryan  and  Sch{\"u}tze, Hinrich},
  title     = {One-Shot Neural Cross-Lingual Transfer for Paradigm Completion},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1993--2003},
  abstract  = {We present a novel cross-lingual transfer method for paradigm completion, the
	task of mapping a lemma to its inflected forms, using a neural encoder-decoder
	model, the state of the art for the monolingual task. We use labeled data from
	a high-resource language to increase performance on a low-resource language. In
	experiments on 21 language pairs from four different language families, we
	obtain up to 58\% higher accuracy than without transfer and show that even
	zero-shot and one-shot learning are possible. We further find that the degree
	of language relatedness strongly influences the ability to transfer
	morphological knowledge.},
  url       = {http://aclweb.org/anthology/P17-1182}
}

% P17-1183 (ACL 2017 Long Papers, pp. 2004--2015): Aharoni & Goldberg,
% morphological inflection generation with hard monotonic attention.
% Entry checked: pages range and anthology URL are consistent.
@InProceedings{aharoni-goldberg:2017:Long,
  author    = {Aharoni, Roee  and  Goldberg, Yoav},
  title     = {Morphological Inflection Generation with Hard Monotonic Attention},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {2004--2015},
  abstract  = {We present a neural model for morphological inflection generation which employs
	a hard attention mechanism, inspired by the nearly-monotonic alignment commonly
	found between the characters in a word and the characters in its inflection. We
	evaluate the model on three previously studied morphological inflection
	generation datasets and show that it provides state of the art results in
	various setups compared to previous neural and non-neural approaches. Finally
	we present an analysis of the continuous representations learned by both the
	hard and soft (Bahdanau, 2014) attention models for the task, shedding some
	light on the features such models extract.},
  url       = {http://aclweb.org/anthology/P17-1183}
}

% P17-1184 (ACL 2017 Long Papers, pp. 2016--2027): Vania & Lopez, comparison
% of subword representations across morphological typologies.
% Entry checked: pages range and anthology URL are consistent.
@InProceedings{vania-lopez:2017:Long,
  author    = {Vania, Clara  and  Lopez, Adam},
  title     = {From Characters to Words to in Between: Do We Capture Morphology?},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {2016--2027},
  abstract  = {Words can be represented by composing the representations of subword units such
	as word segments, characters, and/or character n-grams. While such
	representations are effective and may capture the morphological regularities of
	words, they have not been systematically compared, and it is not understood how
	they interact with different morphological typologies. On a language modeling
	task, we present experiments that systematically vary (1) the basic unit of
	representation, (2) the composition of these representations, and (3) the
	morphological typology of the language modeled. Our results extend previous
	findings that character representations are effective across typologies, and we
	find that a previously unstudied combination of character trigram
	representations composed with bi-LSTMs outperforms most others. But we also
	find room for improvement: none of the character-level models match the
	predictive accuracy of a model with access to true morphological analyses, even
	when learned from an order of magnitude more data.},
  url       = {http://aclweb.org/anthology/P17-1184}
}

% P17-1185 (ACL 2017 Long Papers, pp. 2028--2036): Fonarev et al., Riemannian
% optimization of the SGNS (word2vec) objective.
% NOTE(review): this file's convention leaves titles un-braced; under a
% sentence-casing .bst "Riemannian"/"Skip-Gram" would be lowercased --
% confirm the target style preserves title case before relying on it.
@InProceedings{fonarev-EtAl:2017:Long,
  author    = {Fonarev, Alexander  and  Grinchuk, Oleksii  and  Gusev, Gleb  and  Serdyukov, Pavel  and  Oseledets, Ivan},
  title     = {Riemannian Optimization for Skip-Gram Negative Sampling},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {2028--2036},
  abstract  = {Skip-Gram Negative Sampling (SGNS) word embedding model, well known by its
	implementation in ``word2vec'' software, is usually optimized by stochastic
	gradient descent. However, the optimization of SGNS objective can be viewed as
	a problem of searching for a good matrix with the low-rank constraint. The most
	standard way to solve this type of problems is to apply Riemannian optimization
	framework to optimize the SGNS objective over the manifold of required low-rank
	matrices. In this paper, we propose an algorithm that optimizes SGNS objective
	using Riemannian optimization and demonstrates its superiority over popular
	competitors, such as the original method to train SGNS and SVD over SPPMI
	matrix.},
  url       = {http://aclweb.org/anthology/P17-1185}
}

% P17-1186 (ACL 2017 Long Papers, pp. 2037--2048): Peng, Thomson & Smith,
% deep multitask learning for semantic dependency parsing (NeurboParser).
% Entry checked: pages range and anthology URL are consistent.
@InProceedings{peng-thomson-smith:2017:Long,
  author    = {Peng, Hao  and  Thomson, Sam  and  Smith, Noah A.},
  title     = {Deep Multitask Learning for Semantic Dependency Parsing},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {2037--2048},
  abstract  = {We present a deep neural architecture that parses sentences into three semantic
	dependency graph formalisms. By using efficient, nearly arc-factored inference
	and a bidirectional-LSTM composed with a multi-layer perceptron,  our base
	system is able to significantly improve the state of the art for semantic
	dependency parsing, without using hand-engineered features or syntax. We then
	explore two multitask learning approaches---one that shares parameters across
	formalisms, and one that uses higher-order structures to predict the graphs
	jointly. We find that both approaches improve performance across formalisms on
	average, achieving a new state of the art. Our code is open-source and
	available at https://github.com/Noahs-ARK/NeurboParser.},
  url       = {http://aclweb.org/anthology/P17-1186}
}

% P17-1187 (ACL 2017 Long Papers, pp. 2049--2058): Niu, Xie, Liu & Sun,
% sememe-encoded word representation learning.
% Entry checked: pages range and anthology URL are consistent.
@InProceedings{niu-EtAl:2017:Long,
  author    = {Niu, Yilin  and  Xie, Ruobing  and  Liu, Zhiyuan  and  Sun, Maosong},
  title     = {Improved Word Representation Learning with Sememes},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {2049--2058},
  abstract  = {Sememes are minimum semantic units of word meanings, and the meaning of each
	word sense is typically composed by several sememes. Since sememes are not
	explicit for each word, people manually annotate word sememes and form
	linguistic common-sense knowledge bases. In this paper, we present that, word
	sememe information can improve word representation learning (WRL), which maps
	words into a low-dimensional semantic space and serves as a fundamental step
	for many NLP tasks. The key idea is to utilize word sememes to capture exact
	meanings of a word within specific contexts accurately. More specifically, we
	follow the framework of Skip-gram and present three sememe-encoded models to
	learn representations of sememes, senses and words, where we apply the
	attention scheme to detect word senses in various contexts. We conduct
	experiments on two tasks including word similarity and word analogy, and our
	models significantly outperform baselines. The results indicate that WRL can
	benefit from sememes via the attention scheme, and also confirm our models
	being capable of correctly modeling sememe information.},
  url       = {http://aclweb.org/anthology/P17-1187}
}

% P17-1188 (ACL 2017 Long Papers, pp. 2059--2068): Liu, Lu, Lo & Neubig,
% visual character embeddings for character-level compositionality.
% Fix: abstract grammar garble "which resulting in embeddings" ->
% "which results in embeddings".
@InProceedings{liu-EtAl:2017:Long3,
  author    = {Liu, Frederick  and  Lu, Han  and  Lo, Chieh  and  Neubig, Graham},
  title     = {Learning Character-level Compositionality with Visual Features},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {2059--2068},
  abstract  = {Previous work has modeled the compositionality of words by creating
	character-level models of meaning, reducing problems of sparsity for rare
	words. However, in many writing systems compositionality has an effect even on
	the character-level: the meaning of a character is derived by the sum of its
	parts. In this paper, we model this effect by creating embeddings for
	characters based on their visual characteristics, creating an image for the
	character and running it through a convolutional neural network to produce a
	visual character embedding. Experiments on a text classification task
	demonstrate that such model allows for better processing of instances with rare
	characters in languages such as Chinese, Japanese, and Korean. Additionally,
	qualitative analyses demonstrate that our proposed model learns to focus on the
	parts of characters that carry topical content which results in embeddings
	that are coherent in visual space.},
  url       = {http://aclweb.org/anthology/P17-1188}
}

% P17-1189 (ACL 2017 Long Papers, pp. 2069--2077): Xia, Sha, Chang & Sui,
% progressive learning for Chinese SRL with heterogeneous corpora.
% Fix: stray mid-sentence capital in abstract, "But still, Data sparsity" ->
% "But still, data sparsity".
@InProceedings{xia-EtAl:2017:Long,
  author    = {Xia, Qiaolin  and  Sha, Lei  and  Chang, Baobao  and  Sui, Zhifang},
  title     = {A Progressive Learning Approach to Chinese SRL Using Heterogeneous Data},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {2069--2077},
  abstract  = {Previous studies on Chinese semantic role labeling (SRL) have concentrated on a
	single semantically annotated corpus. But the training data of single corpus is
	often limited. Whereas the other existing semantically annotated corpora for
	Chinese SRL are scattered across different annotation frameworks. But still,
	data sparsity remains a bottleneck. This situation calls for larger training
	datasets, or effective approaches which can take advantage of highly
	heterogeneous data. In this paper, we focus mainly on the latter, that is, to
	improve Chinese SRL by using heterogeneous corpora together. We propose a novel
	progressive learning model which augments the Progressive Neural Network with
	Gated Recurrent Adapters. The model can accommodate heterogeneous inputs and
	effectively transfer knowledge between them. We also release a new corpus,
	Chinese SemBank, for Chinese SRL. Experiments on CPB 1.0 show that our model
	outperforms state-of-the-art methods.},
  url       = {http://aclweb.org/anthology/P17-1189}
}

% P17-1190 (ACL 2017 Long Papers, pp. 2078--2088): Wieting & Gimpel,
% recurrent networks for paraphrastic sentence embeddings (GRAN).
% Entry checked: pages range and anthology URL are consistent.
@InProceedings{wieting-gimpel:2017:Long,
  author    = {Wieting, John  and  Gimpel, Kevin},
  title     = {Revisiting Recurrent Networks for Paraphrastic Sentence Embeddings},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {2078--2088},
  abstract  = {We consider the problem of learning general-purpose, paraphrastic sentence
	embeddings, revisiting the setting of Wieting et al. (2016b). While they found
	LSTM recurrent networks to underperform word averaging, we present several
	developments that together produce the opposite conclusion. These include
	training on sentence pairs rather than phrase pairs, averaging states to
	represent sequences, and regularizing aggressively. These improve LSTMs in both
	transfer learning and supervised settings. We also introduce a new recurrent
	architecture, the Gated Recurrent Averaging Network, that is inspired by
	averaging and LSTMs while outperforming them both. We analyze our learned
	models, finding evidence of preferences for particular parts of speech and
	dependency relations.},
  url       = {http://aclweb.org/anthology/P17-1190}
}

% P17-1191 (ACL 2017 Long Papers, pp. 2089--2098): Dasigi, Ammar, Dyer & Hovy,
% ontology-aware token embeddings for PP attachment.
% Fix: escape the percent signs ("5.4\%", "34.4\%") -- unescaped % starts a
% LaTeX comment when the abstract is typeset, silently dropping text.
@InProceedings{dasigi-EtAl:2017:Long,
  author    = {Dasigi, Pradeep  and  Ammar, Waleed  and  Dyer, Chris  and  Hovy, Eduard},
  title     = {Ontology-Aware Token Embeddings for Prepositional Phrase Attachment},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {2089--2098},
  abstract  = {Type-level word embeddings use the same set of parameters to represent all
	instances of a word regardless of its context, ignoring the inherent lexical
	ambiguity in language. Instead, we embed semantic concepts (or synsets) as
	defined in WordNet and represent a word token in a particular context by
	estimating a distribution over relevant semantic concepts. We use the new,
	context-sensitive embeddings in a model for predicting prepositional phrase
	(PP) attachments and jointly learn the concept embeddings and model parameters.
	We show that using context-sensitive embeddings improves the accuracy of the PP
	attachment model by 5.4\% absolute points, which amounts to a 34.4\% relative
	reduction in errors.},
  url       = {http://aclweb.org/anthology/P17-1191}
}

% P17-1192 (ACL 2017 Long Papers, pp. 2099--2109): Pavlick & Pasca,
% fine-grained IsA extraction via modifier composition.
% Fixes: (1) Unicode curly quotes replaced with TeX ``...'' quoting, matching
% the rest of this (otherwise ASCII, classic-BibTeX) file; (2) stray space in
% "Charles Mingus )" removed; (3) raw ">" wrapped as math ($>$) -- in text
% mode under OT1 encoding ">" renders as an inverted question mark.
@InProceedings{pavlick-pasca:2017:Long,
  author    = {Pavlick, Ellie  and  Pasca, Marius},
  title     = {Identifying 1950s American Jazz Musicians: Fine-Grained IsA Extraction via Modifier Composition},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {2099--2109},
  abstract  = {We present a method for populating fine-grained classes (e.g., ``1950s
	American jazz musicians'') with instances (e.g., Charles Mingus). While
	state-of-the-art methods tend to treat class labels as single lexical units,
	the proposed method considers each of the individual modifiers in the class
	label relative to the head. An evaluation on the task of reconstructing
	Wikipedia category pages demonstrates a $>$10 point increase in AUC, over a
	strong baseline relying on widely-used Hearst patterns.},
  url       = {http://aclweb.org/anthology/P17-1192}
}

% P17-1193 (ACL 2017 Long Papers, pp. 2110--2120): Cao, Huang, Sun & Wan,
% parsing to 1-endpoint-crossing, pagenumber-2 graphs.
% Fixes for PDF-extraction garbling in the abstract: (1) hyphenation break
% "ob-/tains" rejoined to "obtains"; (2) lost superscripts restored --
% "O(n5)"/"O(n4)" are the complexities O(n^5)/O(n^4) (the text itself calls
% the O(n^4) algorithm "quartic-time"), set in math mode; (3) typo
% "descreases" -> "decreases".
@InProceedings{cao-EtAl:2017:Long2,
  author    = {Cao, Junjie  and  Huang, Sheng  and  Sun, Weiwei  and  Wan, Xiaojun},
  title     = {Parsing to 1-Endpoint-Crossing, Pagenumber-2 Graphs},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {2110--2120},
  abstract  = {We study the Maximum Subgraph problem in deep dependency parsing. We consider
	two restrictions to deep dependency graphs: (a) 1-endpoint-crossing and (b)
	pagenumber-2. Our main contribution is an exact algorithm that obtains
	maximum subgraphs satisfying both restrictions simultaneously in time
	$O(n^5)$. Moreover, ignoring one linguistically-rare structure decreases the
	complexity to $O(n^4)$. We also extend our quartic-time algorithm into
	a practical parser with a discriminative disambiguation model and evaluate its
	performance on four linguistic data sets used in
	semantic dependency parsing.},
  url       = {http://aclweb.org/anthology/P17-1193}
}

% P17-1194 (ACL 2017 Long Papers, pp. 2121--2130): Rei, semi-supervised
% multitask sequence labeling with a language-modeling auxiliary objective.
% Entry checked: pages range and anthology URL are consistent.
@InProceedings{rei:2017:Long,
  author    = {Rei, Marek},
  title     = {Semi-supervised Multitask Learning for Sequence Labeling},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {2121--2130},
  abstract  = {We propose a sequence labeling framework with a secondary training objective,
	learning to predict surrounding words for every word in the dataset.
	This language modeling objective incentivises the system to learn
	general-purpose patterns of semantic and syntactic composition, which are also
	useful for improving accuracy on different sequence labeling tasks.
	The architecture was evaluated on a range of datasets, covering the tasks of
	error detection in learner texts, named entity recognition, chunking and
	POS-tagging.
	The novel language modeling objective provided consistent performance
	improvements on every benchmark, without requiring any additional annotated or
	unannotated data.},
  url       = {http://aclweb.org/anthology/P17-1194}
}

% P17-1195 (ACL 2017 Long Papers, pp. 2131--2141): Matsuzaki et al.,
% semantic parsing of pre-university math problems.
% Fixes: (1) author "H. Arai, Noriko" made BibTeX treat "H. Arai" as the
% surname (entry would sort/label under "H"); the author is Noriko H. Arai,
% so the canonical "Last, First" form is "Arai, Noriko H."; (2) escaped the
% percent signs ("88\%", "56\%") so LaTeX does not comment out the rest of
% the line when typesetting the abstract.
@InProceedings{matsuzaki-EtAl:2017:Long,
  author    = {Matsuzaki, Takuya  and  Ito, Takumi  and  Iwane, Hidenao  and  Anai, Hirokazu  and  Arai, Noriko H.},
  title     = {Semantic Parsing of Pre-university Math Problems},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {2131--2141},
  abstract  = {We have been developing an end-to-end math problem solving system that accepts
	natural language input.
	The current paper focuses on how we analyze the problem sentences to produce
	logical forms.
	We chose a hybrid approach combining a shallow syntactic analyzer and a
	manually-developed lexicalized grammar.
	A feature of the grammar is that it is extensively typed on the basis of a
	formal ontology for pre-university math.
	These types are helpful in semantic disambiguation inside and across sentences.
	Experimental results show that the hybrid system produces a well-formed logical
	form with 88\% precision and 56\% recall.},
  url       = {http://aclweb.org/anthology/P17-1195}
}

