% Proceedings volume for EMNLP 2017.
% Editor affiliations (kept here because they do not belong in the editor field,
% where they were being parsed as part of the names):
%   Martha Palmer - University of Colorado
%   Rebecca Hwa - University of Pittsburgh
%   Sebastian Riedel - University College London
@Proceedings{EMNLP2017:2017,
  editor    = {Palmer, Martha  and  Hwa, Rebecca  and  Riedel, Sebastian},
  title     = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url       = {https://www.aclweb.org/anthology/D17-1}
}

@InProceedings{arase-tsujii:2017:EMNLP2017,
  author = {Arase, Yuki and Tsujii, Jun'ichi},
  title = {Monolingual Phrase Alignment on Parse Forests},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages = {1--11},
  url = {https://www.aclweb.org/anthology/D17-1001},
  abstract = {We propose an efficient method to conduct phrase alignment on parse
    forests for paraphrase detection. Unlike previous studies, our method
    identifies syntactic paraphrases under linguistically motivated grammar.
    In addition, it allows phrases to non-compositionally align to handle
    paraphrases with non-homographic phrase correspondences. A dataset that
    provides gold parse trees and their phrase alignments is created. The
    experimental results confirm that the proposed method conducts highly
    accurate phrase alignment compared to human performance.}
}

@InProceedings{shi-huang-lee:2017:EMNLP2017,
  author    = {Shi, Tianze  and  Huang, Liang  and  Lee, Lillian},
  title     = {Fast(er) Exact Decoding and Global Training for Transition-Based Dependency Parsing via a Minimal Feature Set},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {12--23},
  abstract  = {We first present a minimal feature set for transition-based dependency parsing,
	continuing a recent trend started by Kiperwasser and Goldberg (2016a) and Cross
	and Huang (2016a) of using bi-directional LSTM features. We plug our minimal
	feature set into the dynamic-programming framework of Huang and Sagae (2010)
	and Kuhlmann et al. (2011) to produce the first implementation of worst-case
	$O(n^3)$ exact decoders for arc-hybrid and arc-eager transition systems. With our
	minimal features, we also present $O(n^3)$ global training methods. Finally,
	using ensembles including our new parsers, we achieve the best unlabeled
	attachment score reported (to our knowledge) on the Chinese Treebank and the
	``second-best-in-class'' result on the English Penn Treebank.},
  url       = {https://www.aclweb.org/anthology/D17-1002}
}

@InProceedings{cao-EtAl:2017:EMNLP2017,
  author    = {Cao, Junjie  and  Huang, Sheng  and  Sun, Weiwei  and  Wan, Xiaojun},
  title     = {Quasi-Second-Order Parsing for 1-Endpoint-Crossing, Pagenumber-2 Graphs},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {24--34},
  abstract  = {We propose a new Maximum Subgraph algorithm for first-order parsing to
	1-endpoint-crossing, pagenumber-2 graphs. Our algorithm has two
	characteristics: (1) it separates the construction for noncrossing edges and
	crossing edges; (2) in a single construction step, whether to create a new arc
	is deterministic. These two characteristics make our algorithm relatively easy
	to be extended to incorporate crossing-sensitive second-order features. We
	then introduce a new algorithm for quasi-second-order parsing. Experiments
	demonstrate that second-order features are helpful for Maximum Subgraph
	parsing.},
  url       = {https://www.aclweb.org/anthology/D17-1003}
}

@InProceedings{zhang-EtAl:2017:EMNLP20171,
  author    = {Zhang, Yuhao  and  Zhong, Victor  and  Chen, Danqi  and  Angeli, Gabor  and  Manning, Christopher D.},
  title     = {Position-aware Attention and Supervised Data Improve Slot Filling},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {35--45},
  abstract  = {Organized relational knowledge in the form of ``knowledge graphs'' is important
	for many applications. However, the ability to populate knowledge bases with
	facts automatically extracted from documents has improved frustratingly slowly.
	This paper simultaneously addresses two issues that have held back prior work.
	We first propose an effective new model, which combines an LSTM sequence model
	with a form of entity position-aware attention that is better suited to
	relation extraction. Then we build TACRED, a large (119,474 examples)
	supervised relation extraction dataset obtained via crowdsourcing and targeted
	towards TAC KBP relations. The combination of better supervised data and a more
	appropriate high-capacity model enables much better relation extraction
	performance. When the model trained on this new dataset replaces the previous
	relation extraction component of the best TAC KBP 2015 slot filling system, its
	F1 score increases markedly from 22.2\% to 26.7\%.},
  url       = {https://www.aclweb.org/anthology/D17-1004}
}

@InProceedings{liu-EtAl:2017:EMNLP20171,
  author = {Liu, Liyuan and Ren, Xiang and Zhu, Qi and Zhi, Shi and Gui, Huan and Ji, Heng and Han, Jiawei},
  title = {Heterogeneous Supervision for Relation Extraction: A Representation Learning Approach},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages = {46--56},
  url = {https://www.aclweb.org/anthology/D17-1005},
  abstract = {Relation extraction is a fundamental task in information extraction. Most
    existing methods have heavy reliance on annotations labeled by human experts,
    which are costly and time-consuming. To overcome this drawback, we propose a
    novel framework, REHession, to conduct relation extractor learning using
    annotations from heterogeneous information source, e.g., knowledge base and
    domain heuristics. These annotations, referred as heterogeneous supervision,
    often conflict with each other, which brings a new challenge to the original
    relation extraction task: how to infer the true label from noisy labels for a
    given instance. Identifying context information as the backbone of both
    relation extraction and true label discovery, we adopt embedding techniques
    to learn the distributed representations of context, which bridges all
    components with mutual enhancement in an iterative fashion. Extensive
    experimental results demonstrate the superiority of REHession over the
    state-of-the-art.}
}

@InProceedings{wang-zhang-chang:2017:EMNLP2017,
  author = {Wang, Zhongqing and Zhang, Yue and Chang, Ching-Yun},
  title = {Integrating Order Information and Event Relation for Script Event Prediction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages = {57--67},
  url = {https://www.aclweb.org/anthology/D17-1006},
  abstract = {There has been a recent line of work automatically learning scripts from
    unstructured texts, by modeling narrative event chains. While the dominant
    approach group events using event pair relations, LSTMs have been used to
    encode full chains of narrative events. The latter has the advantage of
    learning long-range temporal orders, yet the former is more adaptive to
    partial orders. We propose a neural model that leverages the advantages of
    both methods, by using LSTM hidden states as features for event pair
    modelling. A dynamic memory network is utilized to automatically induce
    weights on existing events for inferring a subsequent event. Standard
    evaluation shows that our method significantly outperforms both methods
    above, giving the best results reported so far.}
}

@InProceedings{tan-EtAl:2017:EMNLP2017,
  author    = {Tan, Chuanqi  and  Wei, Furu  and  Ren, Pengjie  and  Lv, Weifeng  and  Zhou, Ming},
  title     = {Entity Linking for Queries by Searching {Wikipedia} Sentences},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {68--77},
  abstract  = {We present a simple yet effective approach for linking entities in queries. The
	key idea is to search sentences similar to a query from Wikipedia articles and
	directly use the human-annotated entities in the similar sentences as candidate
	entities for the query. Then, we employ a rich set of features, such as
	link-probability, context-matching, word embeddings, and relatedness among
	candidate entities as well as their related entities, to rank the candidates
	under a regression based framework. The advantages of our approach lie in two
	aspects, which contribute to the ranking process and final linking result.
	First, it can greatly reduce the number of candidate entities by filtering out
	irrelevant entities with the words in the query. Second, we can obtain the
	query sensitive prior probability in addition to the static link-probability
	derived from all Wikipedia articles. We conduct experiments on two benchmark
	datasets on entity linking for queries, namely the ERD14 dataset and the GERDAQ
	dataset. Experimental results show that our method outperforms state-of-the-art
	systems and yields 75.0\% in F1 on the ERD14 dataset and 56.9\% on the GERDAQ
	dataset.},
  url       = {https://www.aclweb.org/anthology/D17-1007}
}

@InProceedings{pasini-navigli:2017:EMNLP2017,
  author    = {Pasini, Tommaso  and  Navigli, Roberto},
  title     = {{Train-O-Matic}: Large-Scale Supervised Word Sense Disambiguation in Multiple Languages without Manual Training Data},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {78--88},
  abstract  = {Annotating large numbers of sentences with senses is the heaviest requirement
	of current Word Sense Disambiguation. We present Train-O-Matic, a
	language-independent method for generating millions of sense-annotated training
	instances for virtually all meanings of words in a language's vocabulary. The
	approach is fully automatic: no human intervention is required and the only
	type of human knowledge used is a WordNet-like resource. Train-O-Matic achieves
	consistently state-of-the-art performance across gold standard datasets and
	languages, while at the same time removing the burden of manual annotation. All
	the training data is available for research purposes at http://trainomatic.org.},
  url       = {https://www.aclweb.org/anthology/D17-1008}
}

@InProceedings{reddy-EtAl:2017:EMNLP2017,
  author    = {Reddy, Siva  and  T{\"a}ckstr{\"o}m, Oscar  and  Petrov, Slav  and  Steedman, Mark  and  Lapata, Mirella},
  title     = {Universal Semantic Parsing},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {89--101},
  abstract  = {Universal Dependencies (UD) offer a uniform cross-lingual syntactic
	representation, with the aim of advancing multilingual applications. Recent
	work shows that semantic parsing can be accomplished by transforming syntactic
	dependencies to logical forms. However, this work is limited to English, and
	cannot process dependency graphs, which allow handling complex phenomena such
	as control. In this work, we introduce UDepLambda, a semantic interface for UD,
	which maps natural language to logical forms in an almost language-independent
	fashion and can process dependency graphs. We perform experiments on question
	answering against Freebase and provide German and Spanish translations of the
	WebQuestions and GraphQuestions datasets to facilitate multilingual evaluation.
	Results show that UDepLambda outperforms strong baselines across languages and
	datasets.  For English, it achieves a 4.9 F1 point improvement over the
	state-of-the-art on GraphQuestions.},
  url       = {https://www.aclweb.org/anthology/D17-1009}
}

@InProceedings{pinter-guthrie-eisenstein:2017:EMNLP2017,
  author    = {Pinter, Yuval  and  Guthrie, Robert  and  Eisenstein, Jacob},
  title     = {Mimicking Word Embeddings using Subword {RNNs}},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {102--112},
  abstract  = {Word embeddings improve generalization over lexical features by placing each
	word in a lower-dimensional space, using distributional information obtained
	from unlabeled data. However, the effectiveness of word embeddings for
	downstream NLP tasks is limited by out-of-vocabulary (OOV) words, for which
	embeddings do not exist. In this paper, we present MIMICK, an approach to
	generating OOV word embeddings compositionally, by learning a function from
	spellings to distributional embeddings. Unlike prior work, MIMICK does not
	require re-training on the original word embedding corpus; instead, learning is
	performed at the type level. Intrinsic and extrinsic evaluations demonstrate
	the power of this simple approach. On 23 languages, MIMICK improves
	performance over a word-based baseline for tagging part-of-speech and
	morphosyntactic attributes. It is competitive with (and complementary to) a
	supervised character-based model in low resource settings.},
  url       = {https://www.aclweb.org/anthology/D17-1010}
}

@InProceedings{asgari-schutze:2017:EMNLP2017,
  author    = {Asgari, Ehsaneddin  and  Sch{\"u}tze, Hinrich},
  title     = {Past, Present, Future: A Computational Investigation of the Typology of Tense in 1000 Languages},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {113--124},
  abstract  = {We present SuperPivot, an analysis method for low-resource languages that occur
	in a superparallel corpus, i.e., in a corpus that contains an order of
	magnitude more languages than parallel corpora currently in use. We show that
	SuperPivot performs well for the crosslingual analysis of the linguistic
	phenomenon of tense. We produce analysis results for more than 1000 languages,
	conducting -- to the best of our knowledge -- the largest crosslingual
	computational study performed to date. We extend existing methodology for
	leveraging parallel corpora for typological analysis by overcoming a limiting
	assumption of earlier work: We only require that a linguistic feature is
	overtly marked in a few of thousands of languages as opposed to requiring that
	it be marked in all languages under investigation.},
  url       = {https://www.aclweb.org/anthology/D17-1011}
}

@InProceedings{hashimoto-tsuruoka:2017:EMNLP2017,
  author = {Hashimoto, Kazuma and Tsuruoka, Yoshimasa},
  title = {Neural Machine Translation with Source-Side Latent Graph Parsing},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages = {125--135},
  url = {https://www.aclweb.org/anthology/D17-1012},
  abstract = {This paper presents a novel neural machine translation model which
    jointly learns translation and source-side latent graph representations of
    sentences. Unlike existing pipelined approaches using syntactic parsers, our
    end-to-end model learns a latent graph parser as part of the encoder of an
    attention-based neural machine translation model, and thus the parser is
    optimized according to the translation objective. In experiments, we first
    show that our model compares favorably with state-of-the-art sequential and
    pipelined syntax-based NMT models. We also show that the performance of our
    model can be further improved by pre-training it with a small amount of
    treebank annotations. Our final ensemble model significantly outperforms the
    previous best models on the standard English-to-Japanese translation
    dataset.}
}

@InProceedings{weng-EtAl:2017:EMNLP2017,
  author    = {Weng, Rongxiang  and  Huang, Shujian  and  Zheng, Zaixiang  and  Dai, Xin-Yu  and  Chen, Jiajun},
  title     = {Neural Machine Translation with Word Predictions},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {136--145},
  abstract  = {In the encoder-decoder architecture for neural machine translation (NMT), the
	hidden states of the recurrent structures in the encoder and decoder carry the
	crucial information about the sentence. These vectors are generated by
	parameters which are updated by back-propagation of translation errors through
	time. We argue that propagating errors through the end-to-end recurrent
	structures are not a direct way of control the hidden vectors. 
	In this paper, we propose to use word predictions as a mechanism for direct
	supervision. More specifically, we require these vectors to be able to predict
	the vocabulary in target sentence. Our simple mechanism ensures better
	representations in the encoder and decoder without using any extra data or
	annotation. It is also helpful in reducing the target side vocabulary and
	improving the decoding efficiency. Experiments on Chinese-English machine
	translation task show an average BLEU improvement by 4.53, respectively.},
  url       = {https://www.aclweb.org/anthology/D17-1013}
}

@InProceedings{hoang-haffari-cohn:2017:EMNLP2017,
  author    = {Hoang, Cong Duy Vu  and  Haffari, Gholamreza  and  Cohn, Trevor},
  title     = {Towards Decoding as Continuous Optimisation in Neural Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {146--156},
  abstract  = {We propose a novel decoding approach for neural machine translation (NMT) based
	on continuous optimisation. We reformulate decoding, a discrete optimization
	problem, into a continuous problem, such that optimization can make use of
	efficient gradient-based techniques. Our powerful decoding framework allows for
	more accurate decoding for standard neural machine translation models, as well
	as enabling decoding in intractable models such as intersection of several
	different NMT models. Our empirical results show that our decoding framework is
	effective, and can lead to substantial improvements in translations,
	especially in situations where greedy search and beam search are not feasible.
	Finally, we show how the technique is highly competitive with, and
	complementary to, reranking.},
  url       = {https://www.aclweb.org/anthology/D17-1014}
}

@InProceedings{kitaev-klein:2017:EMNLP2017,
  author    = {Kitaev, Nikita  and  Klein, Dan},
  title     = {Where is {Misty}? Interpreting Spatial Descriptors by Modeling Regions in Space},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {157--166},
  abstract  = {We present a model for locating regions in space based on natural language
	descriptions. Starting with a 3D scene and a sentence, our model is able to
	associate words in the sentence with regions in the scene, interpret relations
	such as `on top of' or `next to,' and finally locate the region described in
	the sentence. All components form a single neural network that is trained
	end-to-end without prior knowledge of object segmentation. To evaluate our
	model, we construct and release a new dataset consisting of Minecraft scenes
	with crowdsourced natural language descriptions. We achieve a 32\% relative
	error reduction compared to a strong neural baseline.},
  url       = {https://www.aclweb.org/anthology/D17-1015}
}

@InProceedings{rahimi-baldwin-cohn:2017:EMNLP2017,
  author = {Rahimi, Afshin and Baldwin, Timothy and Cohn, Trevor},
  title = {Continuous Representation of Location for Geolocation and Lexical Dialectology using Mixture Density Networks},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages = {167--176},
  url = {https://www.aclweb.org/anthology/D17-1016},
  abstract = {We propose a method for embedding two-dimensional locations in a
    continuous vector space using a neural network-based model incorporating
    mixtures of Gaussian distributions, presenting two model variants for
    text-based geolocation and lexical dialectology. Evaluated over Twitter data,
    the proposed model outperforms conventional regression-based geolocation and
    provides a better estimate of uncertainty. We also show the effectiveness of
    the representation for predicting words from location in lexical
    dialectology, and evaluate it using the DARE dataset.}
}

@InProceedings{yin-ordonez:2017:EMNLP2017,
  author    = {Yin, Xuwang  and  Ordonez, Vicente},
  title     = {{Obj2Text}: Generating Visually Descriptive Language from Object Layouts},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {177--187},
  abstract  = {Generating captions for images is a task that has recently received
	considerable attention. Another type of visual inputs are abstract scenes or
	object layouts where the only information provided is a set of objects and
	their locations. This type of imagery is commonly found in many applications in
	computer graphics, virtual reality, and storyboarding. We explore in this paper
	OBJ2TEXT, a sequence-to-sequence model that encodes a set of objects and their
	locations as an input sequence using an LSTM network, and decodes this
	representation using an LSTM language model. We show in our paper that this
	model despite using a sequence encoder can effectively represent complex
	spatial object-object relationships and produce descriptions that are globally
	coherent and semantically relevant. We test our approach for the task of
	describing object layouts in the MS-COCO dataset by producing sentences given
	only object annotations. We additionally show that our model combined with a
	state-of-the-art object detector can improve the accuracy of an image
	captioning model.},
  url       = {https://www.aclweb.org/anthology/D17-1017}
}

@InProceedings{lee-EtAl:2017:EMNLP2017,
  author = {Lee, Kenton and He, Luheng and Lewis, Mike and Zettlemoyer, Luke},
  title = {End-to-end Neural Coreference Resolution},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages = {188--197},
  url = {https://www.aclweb.org/anthology/D17-1018},
  abstract = {We introduce the first end-to-end coreference resolution model and show
    that it significantly outperforms all previous work without using a syntactic
    parser or hand-engineered mention detector. The key idea is to directly
    consider all spans in a document as potential mentions and learn
    distributions over possible antecedents for each. The model computes span
    embeddings that combine context-dependent boundary representations with a
    head-finding attention mechanism. It is trained to maximize the marginal
    likelihood of gold antecedent spans from coreference clusters and is factored
    to enable aggressive pruning of potential mentions. Experiments demonstrate
    state-of-the-art performance, with a gain of 1.5 F1 on the OntoNotes
    benchmark and by 3.1 F1 using a 5-model ensemble, despite the fact that this
    is the first approach to be successfully trained with no external resources.}
}

@InProceedings{li-jurafsky:2017:EMNLP2017,
  author = {Li, Jiwei and Jurafsky, Dan},
  title = {Neural Net Models of Open-domain Discourse Coherence},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages = {198--209},
  url = {https://www.aclweb.org/anthology/D17-1019},
  abstract = {Discourse coherence is strongly associated with text quality, making it
    important to natural language generation and understanding. Yet existing
    models of coherence focus on measuring individual aspects of coherence
    (lexical overlap, rhetorical structure, entity centering) in narrow domains.
    In this paper, we describe domain-independent neural models of discourse
    coherence that are capable of measuring multiple aspects of coherence in
    existing sentences and can maintain coherence while generating new sentences.
    We study both discriminative models that learn to distinguish coherent from
    incoherent discourse, and generative models that produce coherent text,
    including a novel neural latent-variable Markovian generative model that
    captures the latent discourse dependencies between sentences in a text. Our
    work achieves state-of-the-art performance on multiple coherence evaluations,
    and marks an initial step in generating coherent texts given discourse
    contexts.}
}

@InProceedings{wang-EtAl:2017:EMNLP20171,
  author    = {Wang, Kexiang  and  Liu, Tianyu  and  Sui, Zhifang  and  Chang, Baobao},
  title     = {Affinity-Preserving Random Walk for Multi-Document Summarization},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {210--220},
  abstract  = {Multi-document summarization provides users with a short text that summarizes
	the information in a set of related documents. This paper introduces
	affinity-preserving random walk to the summarization task, which preserves the
	affinity relations of sentences by an absorbing random walk model. Meanwhile,
	we put forward adjustable affinity-preserving random walk to enforce the
	diversity constraint of summarization in the random walk process. The ROUGE
	evaluations on DUC 2003 topic-focused summarization task and DUC 2004 generic
	summarization task show the good performance of our method, which has the best
	ROUGE-2 recall among the graph-based ranking methods.},
  url       = {https://www.aclweb.org/anthology/D17-1020}
}

@InProceedings{marasovic-EtAl:2017:EMNLP2017,
  author = {Marasovic, Ana and Born, Leo and Opitz, Juri and Frank, Anette},
  title = {A Mention-Ranking Model for Abstract Anaphora Resolution},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages = {221--232},
  url = {https://www.aclweb.org/anthology/D17-1021},
  abstract = {Resolving abstract anaphora is an important, but difficult task for text
    understanding. Yet, with recent advances in representation learning this
    task becomes a more tangible aim. A central property of abstract anaphora is
    that it establishes a relation between the anaphor embedded in the anaphoric
    sentence and its (typically non-nominal) antecedent. We propose a
    mention-ranking model that learns how abstract anaphors relate to their
    antecedents with an LSTM-Siamese Net. We overcome the lack of training data
    by generating artificial anaphoric sentence--antecedent pairs. Our model
    outperforms state-of-the-art results on shell noun resolution. We also
    report first benchmark results on an abstract anaphora subset of the ARRAU
    corpus. This corpus presents a greater challenge due to a mixture of nominal
    and pronominal anaphors and a greater range of confounders. We found model
    variants that outperform the baselines for nominal anaphors, without
    training on individual anaphor data, but still lag behind for pronominal
    anaphors. Our model selects syntactically plausible candidates and -- if
    disregarding syntax -- discriminates candidates using deeper features.}
}

% The compound surname "Schulte im Walde" is braced on purpose: unbraced, BibTeX
% takes everything up to the lowercase "im" as the von part and only "Walde" as
% the last name.
@InProceedings{nguyen-EtAl:2017:EMNLP2017,
  author    = {Nguyen, Kim Anh  and  K{\"o}per, Maximilian  and  {Schulte im Walde}, Sabine  and  Vu, Ngoc Thang},
  title     = {Hierarchical Embeddings for Hypernymy Detection and Directionality},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {233--243},
  abstract  = {We present a novel neural model HyperVec to learn hierarchical embeddings for
	hypernymy detection and directionality. While previous embeddings have shown
	limitations on prototypical hypernyms, HyperVec represents an unsupervised
	measure where embeddings are learned in a specific order and capture the
	hypernym--hyponym distributional hierarchy. Moreover, our model is able to
	generalize over unseen hypernymy pairs, when using only small sets of training
	data, and by mapping to other languages. Results on benchmark datasets show
	that HyperVec outperforms both state-of-the-art unsupervised measures and
	embedding models on hypernymy detection and directionality, and on predicting
	graded lexical entailment.},
  url       = {https://www.aclweb.org/anthology/D17-1022}
}

@InProceedings{zhao-EtAl:2017:EMNLP20171,
  author    = {Zhao, Zhe  and  Liu, Tao  and  Li, Shen  and  Li, Bofang  and  Du, Xiaoyong},
  title     = {Ngram2vec: Learning Improved Word Representations from Ngram Co-occurrence Statistics},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {244--253},
  abstract  = {The existing word representation methods mostly limit their information source
	to word co-occurrence statistics. In this paper, we introduce ngrams into four
	representation methods: SGNS, GloVe, PPMI matrix, and its SVD factorization.
	Comprehensive experiments are conducted on word analogy and similarity tasks.
	The results show that improved word representations are learned from ngram
	co-occurrence statistics. We also demonstrate that the trained ngram
	representations are useful in many aspects such as finding antonyms and
	collocations. Besides, a novel approach of building co-occurrence matrix is
	proposed to alleviate the hardware burdens brought by ngrams.},
  url       = {https://www.aclweb.org/anthology/D17-1023}
}

@InProceedings{tissier-gravier-habrard:2017:EMNLP2017,
  author    = {Tissier, Julien  and  Gravier, Christopher  and  Habrard, Amaury},
  title     = {Dict2vec : Learning Word Embeddings using Lexical Dictionaries},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {254--263},
  abstract  = {Learning word embeddings on large unlabeled corpus has been shown to be
	successful in improving many natural language tasks. The most efficient and
	popular approaches learn or retrofit such representations using additional
	external data. Resulting embeddings are generally better than their corpus-only
	counterparts, although such resources cover a fraction of words in the
	vocabulary. In this paper, we propose a new approach, Dict2vec, based on one of
	the largest yet refined datasource for describing words -- natural language
	dictionaries. Dict2vec builds new word pairs from dictionary entries so that
	semantically-related words are moved closer, and negative sampling filters out
	pairs whose words are unrelated in dictionaries. We evaluate the word
	representations obtained using Dict2vec on eleven datasets for the word
	similarity task and on four datasets for a text classification task.},
  url       = {https://www.aclweb.org/anthology/D17-1024}
}

@InProceedings{su-lee:2017:EMNLP2017,
  author    = {Su, Tzu-ray  and  Lee, Hung-yi},
  title     = {Learning {Chinese} Word Representations From Glyphs Of Characters},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {264--273},
  abstract  = {In this paper, we propose new methods to learn Chinese word representations.
	Chinese characters are composed of graphical components, which carry rich
	semantics. It is common for a Chinese learner to comprehend the meaning of a
	word from these graphical components. As a result, we propose models that
	enhance word representations by character glyphs. The character glyph features
	are directly learned from the bitmaps of characters by convolutional
	auto-encoder (convAE), and the glyph features improve Chinese word
	representations which are already enhanced by character embeddings. Another
	contribution in this paper is that we created several evaluation datasets in
	traditional Chinese and made them public.},
  url       = {https://www.aclweb.org/anthology/D17-1025}
}

@InProceedings{wieting-mallinson-gimpel:2017:EMNLP2017,
  author    = {Wieting, John  and  Mallinson, Jonathan  and  Gimpel, Kevin},
  title     = {Learning Paraphrastic Sentence Embeddings from Back-Translated Bitext},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {274--285},
  abstract  = {We consider the problem of learning general-purpose, paraphrastic sentence
	embeddings in the setting of Wieting et al. (2016b). We use neural machine
	translation to generate sentential paraphrases via back-translation of
	bilingual sentence pairs. We evaluate the paraphrase pairs by their ability to
	serve as training data for learning paraphrastic sentence embeddings. We find
	that the data quality is stronger than prior work based on bitext and on par
	with manually-written English paraphrase pairs, with the advantage that our
	approach can scale up to generate large training sets for many languages and
	domains. We experiment with several language pairs and data sources, and
	develop a variety of data filtering techniques. In the process, we explore how
	neural machine translation output differs from human-written sentences, finding
	clear differences in length, the amount of repetition, and the use of rare
	words.},
  url       = {https://www.aclweb.org/anthology/D17-1026}
}

@InProceedings{yu-EtAl:2017:EMNLP20171,
  author    = {Yu, Jinxing  and  Jian, Xun  and  Xin, Hao  and  Song, Yangqiu},
  title     = {Joint Embeddings of {Chinese} Words, Characters, and Fine-grained Subcharacter Components},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {286--291},
  abstract  = {Word embeddings have attracted much attention recently. Different from
	alphabetic writing systems, Chinese characters are often composed of
	subcharacter components which are also semantically informative. In this work,
	we propose an approach to jointly embed Chinese words as well as their
	characters and fine-grained subcharacter components. We use three likelihoods
	to evaluate whether the context words, characters, and components can predict
	the current target word, and collected 13,253 subcharacter components to
	demonstrate the existing approaches of decomposing Chinese characters are not
	enough. Evaluation on both word similarity and word analogy tasks demonstrates
	the superior performance of our model.},
  url       = {https://www.aclweb.org/anthology/D17-1027}
}

@InProceedings{gupta-EtAl:2017:EMNLP2017,
  author    = {Gupta, Arihant  and  Akhtar, Syed Sarfaraz  and  Vajpayee, Avijit  and  Srivastava, Arjit  and  Jhanwar, Madan Gopal  and  Shrivastava, Manish},
  title     = {Exploiting Morphological Regularities in Distributional Word Representations},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {292--297},
  abstract  = {We present an unsupervised, language agnostic approach for exploiting
	morphological regularities present in high dimensional vector spaces. We
	propose a novel method for generating embeddings of words from their
	morphological variants using morphological transformation operators. We
	evaluate this approach on MSR word analogy test set with an accuracy of 85\%
	which is 12\% higher than the previous best known system.},
  url       = {https://www.aclweb.org/anthology/D17-1028}
}

@InProceedings{wang-zhang-zong:2017:EMNLP2017,
  author    = {Wang, Shaonan  and  Zhang, Jiajun  and  Zong, Chengqing},
  title     = {Exploiting Word Internal Structures for Generic {Chinese} Sentence Representation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {298--303},
  abstract  = {We introduce a novel mixed character-word architecture to improve Chinese
	sentence representations, by utilizing rich semantic information of word
	internal structures. Our architecture uses two key strategies. The first is a
	mask gate on characters, learning the relation among characters in a word. The
	second is a maxpooling operation on words, adaptively finding the optimal
	mixture of the atomic and compositional word representations. Finally, the
	proposed architecture is applied to various sentence composition models, which
	achieves substantial performance gains over baseline models on sentence
	similarity task.},
  url       = {https://www.aclweb.org/anthology/D17-1029}
}

@InProceedings{herbelot-baroni:2017:EMNLP2017,
  author    = {Herbelot, Aur{\'e}lie  and  Baroni, Marco},
  title     = {High-risk learning: acquiring new word vectors from tiny data},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {304--309},
  abstract  = {Distributional semantics models are known to struggle with small data. It is
	generally accepted that in order to learn 'a good vector' for a word, a model
	must have sufficient examples of its usage. This contradicts the fact that
	humans can guess the meaning of a word from a few occurrences only.  In this
	paper, we show that a neural language model such as Word2Vec only necessitates
	minor modifications to its standard architecture to learn new terms from tiny
	data, using background knowledge from a previously learnt semantic space. We
	test our model on word definitions and on a nonce task involving 2-6 sentences'
	worth of context, showing a large increase in performance over state-of-the-art
	models on the definitional task.},
  url       = {https://www.aclweb.org/anthology/D17-1030}
}

@InProceedings{sanu-EtAl:2017:EMNLP2017,
  author    = {Sanu, Joseph  and  Xu, Mingbin  and  Jiang, Hui  and  Liu, Quan},
  title     = {Word Embeddings based on Fixed-Size Ordinally Forgetting Encoding},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {310--315},
  abstract  = {In this paper, we propose to learn word embeddings based on the recent
	fixed-size ordinally forgetting encoding (FOFE) method, which can almost
	uniquely encode any variable-length sequence into a fixed-size representation.
	We use FOFE to fully encode the left and right context of each word in a corpus
	to construct a novel word-context matrix, which is further weighted and
	factorized using truncated SVD to generate low-dimension word embedding
	vectors. We evaluate this alternate method in encoding word-context statistics
	and show the new FOFE method has a notable effect on the resulting word
	embeddings. Experimental results on several popular word similarity tasks have
	demonstrated that the proposed method  outperforms other SVD models that use
	canonical count based techniques to generate word context matrices.},
  url       = {https://www.aclweb.org/anthology/D17-1031}
}

@InProceedings{fernandez-yu-downey:2017:EMNLP2017,
  author    = {Fernandez, Jared  and  Yu, Zhaocheng  and  Downey, Doug},
  title     = {{VecShare}: A Framework for Sharing Word Representation Vectors},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {316--320},
  abstract  = {Many Natural Language Processing (NLP) models rely on distributed vector
	representations of words.  Because the process of training word vectors can
	require large amounts of data and computation, NLP researchers and
	practitioners often utilize pre-trained embeddings downloaded from the Web. 
	However, finding the best embeddings for a given task is difficult, and can be
	computationally prohibitive.  We present a framework, called VecShare, that
	makes it easy to share and retrieve word embeddings on the Web.  The framework
	leverages a public data-sharing infrastructure to host embedding sets, and
	provides automated mechanisms for retrieving the embeddings most similar to a
	given corpus.  We perform an experimental evaluation of VecShare's similarity
	strategies, and show that they are effective at efficiently retrieving
	embeddings that boost accuracy in a document classification task.  Finally, we
	provide an open-source Python library for using the VecShare framework.},
  url       = {https://www.aclweb.org/anthology/D17-1032}
}

@InProceedings{hasan-curry:2017:EMNLP2017,
  author    = {Hasan, Souleiman  and  Curry, Edward},
  title     = {Word Re-Embedding via Manifold Dimensionality Retention},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {321--326},
  abstract  = {Word embeddings seek to recover a Euclidean metric space by mapping words into
	vectors, starting from word co-occurrences in a corpus. Word embeddings may
	underestimate the similarity between nearby words, and overestimate it between
	distant words in the Euclidean metric space. In this paper, we re-embed
	pre-trained word embeddings with a stage of manifold learning which retains
	dimensionality. We show that this approach is
	theoretically founded in the metric recovery paradigm, and empirically show
	that it can improve on state-of-the-art embeddings in word similarity tasks by
	0.5--5.0\% points depending on the original space.},
  url       = {https://www.aclweb.org/anthology/D17-1033}
}

@InProceedings{lee-chen:2017:EMNLP2017,
  author    = {Lee, Guang-He  and  Chen, Yun-Nung},
  title     = {{MUSE}: Modularizing Unsupervised Sense Embeddings},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {327--337},
  abstract  = {This paper proposes to address the word sense ambiguity issue in an
	unsupervised manner, where word sense representations are learned along a word
	sense selection mechanism given contexts. Prior work focused on designing a
	single model to deliver both mechanisms, and thus suffered from either
	coarse-grained representation learning or inefficient sense selection. The
	proposed modular approach, MUSE, implements flexible modules to optimize
	distinct mechanisms, achieving the first purely sense-level representation
	learning system with linear-time sense selection. We leverage reinforcement
	learning to enable joint training on the proposed modules, and introduce
	various exploration techniques on sense selection for better robustness. The
	experiments on benchmark data show that the proposed approach achieves the
	state-of-the-art performance on synonym selection as well as on contextual word
	similarities in terms of MaxSimC.},
  url       = {https://www.aclweb.org/anthology/D17-1034}
}

@InProceedings{reimers-gurevych:2017:EMNLP2017,
  author    = {Reimers, Nils  and  Gurevych, Iryna},
  title     = {Reporting Score Distributions Makes a Difference: Performance Study of {LSTM}-networks for Sequence Tagging},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {338--348},
  abstract  = {In this paper we show that reporting a single performance score is insufficient
	to compare non-deterministic approaches. We demonstrate for common sequence
	tagging tasks that the seed value for the random number generator can result in
	statistically significant ($p < 10^{-4}$) differences for state-of-the-art systems.
	For two recent systems for NER, we observe an absolute difference of one
	percentage point $F_1$-score depending on the selected seed value, making these
	systems perceived either as state-of-the-art or mediocre. Instead of publishing
	and reporting single performance scores, we propose to compare score
	distributions based on multiple executions. 
	Based on the evaluation of 50,000 LSTM-networks for five sequence tagging
	tasks, we present network architectures that produce both superior performance
	as well as are more stable with respect to the remaining hyperparameters.},
  url       = {https://www.aclweb.org/anthology/D17-1035}
}

@InProceedings{martins-kreutzer:2017:EMNLP2017,
  author    = {Martins, Andr{\'e} F. T.  and  Kreutzer, Julia},
  title     = {Learning What's Easy: Fully Differentiable Neural Easy-First Taggers},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {349--362},
  abstract  = {We introduce a novel neural easy-first decoder that learns to solve sequence
	tagging tasks in a flexible order. In contrast to previous easy-first decoders,
	our models are end-to-end differentiable. The decoder iteratively updates a
	``sketch'' of the predictions over the sequence. At its core is an attention
	mechanism that controls which parts of the input are strategically the best to
	process next. We present a new constrained softmax transformation that ensures
	the same cumulative attention to every word, and show how to efficiently
	evaluate and backpropagate over it. Our models compare favourably to BILSTM
	taggers on three sequence tagging tasks.},
  url       = {https://www.aclweb.org/anthology/D17-1036}
}

@InProceedings{kaji-kobayashi:2017:EMNLP2017,
  author    = {Kaji, Nobuhiro  and  Kobayashi, Hayato},
  title     = {Incremental Skip-gram Model with Negative Sampling},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {363--371},
  abstract  = {This paper explores an incremental training strategy for the skip-gram model
	with negative sampling (SGNS) from both empirical and theoretical perspectives.
	Existing methods of neural word embeddings, including SGNS, are multi-pass
	algorithms and thus cannot perform incremental model update. To address this
	problem, we present a simple incremental extension of SGNS and provide a
	thorough theoretical analysis to demonstrate its validity. Empirical
	experiments demonstrated the correctness of the theoretical analysis as well as
	the practical usefulness of the incremental algorithm.},
  url       = {https://www.aclweb.org/anthology/D17-1037}
}

@InProceedings{ruder-plank:2017:EMNLP2017,
  author    = {Ruder, Sebastian  and  Plank, Barbara},
  title     = {Learning to select data for transfer learning with {Bayesian} Optimization},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {372--382},
  abstract  = {Domain similarity measures can be used to gauge adaptability and select
	suitable data for transfer learning, but existing approaches define ad hoc
	measures that are deemed suitable for respective tasks. Inspired by work on
	curriculum learning, we propose to learn data selection measures using Bayesian
	Optimization and evaluate them across models, domains and tasks.
	Our learned measures outperform existing domain similarity measures
	significantly on three tasks: sentiment analysis, part-of-speech tagging, and
	parsing.  We show the importance of complementing similarity with diversity,
	and that learned measures are---to some degree---transferable across models,
	domains, and even tasks.},
  url       = {https://www.aclweb.org/anthology/D17-1038}
}

@InProceedings{ramachandran-liu-le:2017:EMNLP2017,
  author    = {Ramachandran, Prajit  and  Liu, Peter  and  Le, Quoc},
  title     = {Unsupervised Pretraining for Sequence to Sequence Learning},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {383--391},
  abstract  = {This work presents a general unsupervised learning method to improve the
	accuracy of sequence to sequence (seq2seq) models. In our method, the weights
	of the encoder and decoder of a seq2seq model are initialized with the
	pretrained weights of two language models and then fine-tuned with labeled
	data. We apply this method to challenging benchmarks in machine translation and
	abstractive summarization and find that it significantly improves the
	subsequent supervised models.  Our main result is that pretraining improves the
	generalization of seq2seq models. We achieve state-of-the-art results on the
	WMT English$\rightarrow$German task, surpassing a range of methods using both
	phrase-based machine translation and neural machine translation. Our method
	achieves a significant improvement of 1.3 BLEU from the previous best models on
	both WMT'14 and WMT'15 English$\rightarrow$German. We also conduct human
	evaluations on abstractive summarization and find that our method outperforms a
	purely supervised learning baseline in a statistically significant manner.},
  url       = {https://www.aclweb.org/anthology/D17-1039}
}

@InProceedings{britz-guan-luong:2017:EMNLP2017,
  author    = {Britz, Denny  and  Guan, Melody  and  Luong, Minh-Thang},
  title     = {Efficient Attention using a Fixed-Size Memory Representation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {392--400},
  abstract  = {The standard content-based attention mechanism typically used in
	sequence-to-sequence models is computationally expensive as it requires the
	comparison of large encoder and decoder states at each time step. In this work,
	we propose an alternative attention mechanism based on a fixed size memory
	representation that is more efficient. Our technique predicts a compact set of
	K attention contexts during encoding and lets the decoder compute an efficient
	lookup that does not need to consult the memory. We show that our approach
	performs on-par with the standard attention mechanism while yielding inference
	speedups of 20\% for real-world translation tasks and more for tasks with longer
	sequences. By visualizing attention scores we demonstrate that our models learn
	distinct, meaningful alignments.},
  url       = {https://www.aclweb.org/anthology/D17-1040}
}

@InProceedings{park-bak-oh:2017:EMNLP2017,
  author    = {Park, Sungjoon  and  Bak, JinYeong  and  Oh, Alice},
  title     = {Rotated Word Vector Representations and their Interpretability},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {401--411},
  abstract  = {Vector representation of words improves performance in various NLP tasks, but
	the high dimensional word vectors are very difficult to interpret. We apply
	several rotation algorithms to the vector representation of words to improve
	the interpretability. Unlike previous approaches that induce sparsity, the
	rotated vectors are interpretable while preserving the expressive performance
	of the original vectors. Furthermore, any prebuilt word vector representation
	can be rotated for improved interpretability. We apply rotation to skipgrams
	and glove and compare the expressive power and interpretability with the
	original vectors and the sparse overcomplete vectors. The results show that the
	rotated vectors outperform the original and the sparse overcomplete vectors for
	interpretability and expressiveness tasks.},
  url       = {https://www.aclweb.org/anthology/D17-1041}
}

@InProceedings{alvarezmelis-jaakkola:2017:EMNLP2017,
  author    = {Alvarez-Melis, David  and  Jaakkola, Tommi},
  title     = {A causal framework for explaining the predictions of black-box sequence-to-sequence models},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {412--421},
  abstract  = {We interpret the predictions of any black-box structured input-structured
	output model around a specific input-output pair. Our method returns an
	``explanation'' consisting of groups of input-output tokens that are causally
	related.  These dependencies are inferred by querying the model with perturbed
	inputs, generating a graph over tokens from the responses, and solving a
	partitioning problem to select the most relevant components. We focus the
	general approach on sequence-to-sequence problems, adopting a variational
	autoencoder to yield meaningful input perturbations. We test our method across
	several NLP sequence generation tasks.},
  url       = {https://www.aclweb.org/anthology/D17-1042}
}

@InProceedings{serban-EtAl:2017:EMNLP2017,
  author    = {Serban, Iulian Vlad  and  Ororbia, Alexander G.  and  Pineau, Joelle  and  Courville, Aaron},
  title     = {Piecewise Latent Variables for Neural Variational Text Processing},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {422--432},
  abstract  = {Advances in neural variational inference have facilitated the learning of
	powerful directed graphical models with continuous latent variables, such as
	variational autoencoders. The hope is that such models will learn to represent
	rich, multi-modal latent factors in real-world data, such as natural language
	text. However, current models often assume simplistic priors on the latent
	variables -- such as the uni-modal Gaussian distribution -- which are incapable
	of representing complex latent factors efficiently. To overcome this
	restriction, we propose the simple, but highly flexible, piecewise constant
	distribution. This distribution has the capacity to represent an exponential
	number of modes of a latent target distribution, while remaining mathematically
	tractable. Our results demonstrate that incorporating this new latent
	distribution into different models yields substantial improvements in natural
	language processing tasks such as document modeling and natural language
	generation for dialogue.},
  url       = {https://www.aclweb.org/anthology/D17-1043}
}

@InProceedings{lavergne-yvon:2017:EMNLP2017,
  author    = {Lavergne, Thomas  and  Yvon, Fran{\c{c}}ois},
  title     = {Learning the Structure of Variable-Order {CRFs}: a finite-state perspective},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {433--439},
  abstract  = {The computational complexity of linear-chain Conditional Random Fields (CRFs)
	makes it difficult to deal with very large label sets and long range
	dependencies. Such situations are not rare and arise when dealing with
	morphologically rich languages or joint labelling tasks. We extend here recent
	proposals to consider variable order CRFs. Using an effective finite-state
	representation of variable-length dependencies, we propose new ways to perform
	feature selection at large scale and report experimental results where we
	outperform strong baselines on a tagging task.},
  url       = {https://www.aclweb.org/anthology/D17-1044}
}

@InProceedings{aji-heafield:2017:EMNLP2017,
  author    = {Aji, Alham Fikri  and  Heafield, Kenneth},
  title     = {Sparse Communication for Distributed Gradient Descent},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {440--445},
  abstract  = {We make distributed stochastic gradient descent faster by exchanging sparse
	updates instead of dense updates. Gradient updates are positively skewed as
	most updates are near zero, so we map the 99\% smallest updates (by absolute
	value) to zero then exchange sparse matrices. This method can be combined with
	quantization to further improve the compression. We explore different
	configurations and apply them to neural machine translation and MNIST image
	classification tasks. Most configurations work on MNIST, whereas different
	configurations reduce convergence rate on the more complex translation task.
	Our experiments show that we can achieve up to 49\% speed up on MNIST and 22\% on
	NMT without damaging the final accuracy or BLEU.},
  url       = {https://www.aclweb.org/anthology/D17-1045}
}

@InProceedings{lu-lund-boydgraber:2017:EMNLP2017,
  author    = {Lu, You  and  Lund, Jeffrey  and  Boyd-Graber, Jordan},
  title     = {Why {ADAGRAD} Fails for Online Topic Modeling},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {446--451},
  abstract  = {Online topic modeling, i.e., topic modeling
	with stochastic variational inference, is a
	powerful and efficient technique for analyzing
	large datasets, and ADAGRAD is a
	widely-used technique for tuning learning
	rates during online gradient optimization.
	However, these two techniques do not work
	well together. We show that this is because
	ADAGRAD uses accumulation of previous
	gradients as the learning rates' denominators.
	For online topic modeling, the magnitude
	of gradients is very large. It causes
	learning rates to shrink very quickly, so the
	parameters cannot fully converge until the
	training ends.},
  url       = {https://www.aclweb.org/anthology/D17-1046}
}

@InProceedings{chen-EtAl:2017:EMNLP20171,
  author    = {Chen, Peng  and  Sun, Zhongqian  and  Bing, Lidong  and  Yang, Wei},
  title     = {Recurrent Attention Network on Memory for Aspect Sentiment Analysis},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {452--461},
  abstract  = {We propose a novel framework based on neural networks to identify the sentiment
	of opinion targets in a comment/review. Our framework adopts multiple-attention
	mechanism to capture sentiment features separated by a long distance, so that
	it is more robust against irrelevant information. The results of multiple
	attentions are non-linearly combined with a recurrent neural network, which
	strengthens the expressive power of our model for handling more complications.
	The weighted-memory mechanism not only helps us avoid the labor-intensive
	feature engineering work, but also provides a tailor-made memory for different
	opinion targets of a sentence. We examine the merit of our model on four
	datasets: two are from SemEval2014, i.e. reviews of restaurants and laptops; a
	twitter dataset, for testing its performance on social media data; and a
	Chinese news comment dataset, for testing its language sensitivity. The
	experimental results show that our model consistently outperforms the
	state-of-the-art methods on different types of data.},
  url       = {https://www.aclweb.org/anthology/D17-1047}
}

@InProceedings{long-EtAl:2017:EMNLP20171,
  author    = {Long, Yunfei  and  Qin, Lu  and  Xiang, Rong  and  Li, Minglei  and  Huang, Chu-Ren},
  title     = {A Cognition Based Attention Model for Sentiment Analysis},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {462--471},
  abstract  = {Attention models are proposed in sentiment analysis because some words are more
	important than others. However, most existing methods either use local context
	based text information or user preference information. In this work, we propose
	a novel attention model trained by cognition grounded eye-tracking data. A
	reading prediction model is first built using eye-tracking data as dependent
	data and other features in the context as independent data. The predicted
	reading time is then used to build a cognition based attention (CBA) layer for
	neural sentiment analysis. As a comprehensive model, we can capture attentions
	of words in sentences as well as sentences in documents. Different attention
	mechanisms can also be incorporated to capture other aspects of attentions.
	Evaluations show the CBA based method outperforms the state-of-the-art local
	context based attention methods significantly. This brings insight to how
	cognition grounded data can be brought into NLP tasks.},
  url       = {https://www.aclweb.org/anthology/D17-1048}
}

@InProceedings{poddar-hsu-lee:2017:EMNLP2017,
  author    = {Poddar, Lahari  and  Hsu, Wynne  and  Lee, Mong Li},
  title     = {Author-aware Aspect Topic Sentiment Model to Retrieve Supporting Opinions from Reviews},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {472--481},
  abstract  = {User generated content about products and services in the form of reviews are
	often diverse and even contradictory. This makes it difficult for users to know
	if an opinion in a review is prevalent or biased.
	We study the problem of searching for supporting opinions in the context of
	reviews. We propose a framework called SURF, that first identifies opinions
	expressed in a review, and then finds similar opinions from other reviews. We
	design a novel probabilistic graphical model that captures opinions as a
	combination of aspect, topic and sentiment dimensions, takes into account the
	preferences of individual authors, as well as the quality of the entity under
	review, and encodes the flow of thoughts in a review by constraining the aspect
	distribution dynamically among successive review segments. We derive a
	similarity measure that considers both lexical and semantic similarity to find
	supporting opinions. Experiments on TripAdvisor hotel reviews and Yelp
	restaurant reviews show that our model outperforms existing methods for
	modeling opinions, and the proposed framework is effective in finding
	supporting opinions.},
  url       = {https://www.aclweb.org/anthology/D17-1049}
}

@InProceedings{ghosh-veale:2017:EMNLP2017,
  author    = {Ghosh, Aniruddha  and  Veale, Tony},
  title     = {Magnets for Sarcasm: Making Sarcasm Detection Timely, Contextual and Very Personal},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {482--491},
  abstract  = {Sarcasm is a pervasive phenomenon in social media, permitting the concise
	communication of meaning, affect and attitude. Concision requires wit to
	produce and wit to understand, which demands from each party knowledge of
	norms, context and a speaker's mindset. Insight into a speaker's psychological
	profile at the time of production is a valuable source of context for sarcasm
	detection. Using a neural architecture, we show significant gains in detection
	accuracy when knowledge of the speaker's mood at the time of production can be
	inferred. Our focus is on sarcasm detection on Twitter, and we show that the mood
	exhibited by a speaker over tweets leading up to a new post is as useful a cue
	for sarcasm as the topical context of the post itself. The work opens the door
	to an empirical exploration not just of sarcasm in text but of the sarcastic
	state of mind.},
  url       = {https://www.aclweb.org/anthology/D17-1050}
}

@InProceedings{morales-zhai:2017:EMNLP2017,
  author    = {Morales, Alex  and  Zhai, Chengxiang},
  title     = {Identifying Humor in Reviews using Background Text Sources},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {492--501},
  abstract  = {We study the problem of automatically identifying humorous text from a new kind
	of text data, i.e., online reviews. We propose a generative language model,
	based on the theory of incongruity, to model humorous text, which allows us to
	leverage background text sources, such as Wikipedia entry descriptions, and
	enables construction of multiple features for identifying humorous reviews.
	Evaluation of these features using supervised learning for classifying reviews
	into humorous and non-humorous reviews shows that the features constructed
	based on the proposed generative model are much more effective than the major
	features proposed in the existing literature, allowing us to achieve almost
	86\% accuracy. These humorous review predictions can also supply good
	indicators for identifying helpful reviews.},
  url       = {https://www.aclweb.org/anthology/D17-1051}
}

@InProceedings{wang-xia:2017:EMNLP2017,
  author    = {Wang, Leyi  and  Xia, Rui},
  title     = {Sentiment Lexicon Construction with Representation Learning Based on Hierarchical Sentiment Supervision},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {502--510},
  abstract  = {Sentiment lexicon is an important tool for identifying the sentiment polarity
	of words and texts. How to automatically construct sentiment lexicons has
	become a research topic in the field of sentiment analysis and opinion mining.
	Recently there were some attempts to employ representation learning algorithms
	to construct a sentiment lexicon with sentiment-aware word embedding. However,
	these methods were normally trained under document-level sentiment supervision.
	In this paper, we develop a neural architecture to train a sentiment-aware word
	embedding by integrating the sentiment supervision at both document and word
	levels, to enhance the quality of word embedding as well as the sentiment
	lexicon. Experiments on the SemEval 2013-2016 datasets indicate that the
	sentiment lexicon generated by our approach achieves the state-of-the-art
	performance in both supervised and unsupervised sentiment classification, in
	comparison with several strong sentiment lexicon construction methods.},
  url       = {https://www.aclweb.org/anthology/D17-1052}
}

@InProceedings{xu-wan:2017:EMNLP2017,
  author    = {Xu, Kui  and  Wan, Xiaojun},
  title     = {Towards a Universal Sentiment Classifier in Multiple Languages},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {511--520},
  abstract  = {Existing sentiment classifiers usually work for only one specific language, and
	different classification models are used in different languages. In this paper
	we aim to build a universal sentiment classifier with a single classification
	model in multiple different languages. In order to achieve this goal, we
	propose to learn multilingual sentiment-aware word embeddings simultaneously
	based only on the labeled reviews in English and unlabeled parallel data
	available in a few language pairs. It is not required that the parallel data
	exist between English and any other language, because the sentiment information
	can be transferred into any language via pivot languages. We present the
	evaluation results of our universal sentiment classifier in five languages, and
	the results are very promising even when the parallel data between English and
	the target languages are not used. Furthermore, the universal single classifier
	is compared with a few cross-language sentiment classifiers relying on direct
	parallel data between the source and target languages, and the results show
	that the performance of our universal sentiment classifier is very promising
	compared to that of different cross-language classifiers in multiple target
	languages.},
  url       = {https://www.aclweb.org/anthology/D17-1053}
}

@InProceedings{dou:2017:EMNLP2017,
  author    = {Dou, Zi-Yi},
  title     = {Capturing User and Product Information for Document Level Sentiment Analysis with Deep Memory Network},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {521--526},
  abstract  = {Document-level sentiment classification is a fundamental problem which aims to
	predict a user's overall sentiment about a 
	product in a document. Several methods have been proposed to tackle the problem
	whereas most of them fail to consider the influence of users who express the
	sentiment and products which are evaluated. To address the issue,
	we propose a deep memory network for document-level sentiment classification
	which could capture the user and product information at the same time. To prove
	the effectiveness of our algorithm, we conduct experiments on IMDB and Yelp
	datasets and the results indicate that our model can achieve better performance
	than several existing methods.},
  url       = {https://www.aclweb.org/anthology/D17-1054}
}

@InProceedings{yang-EtAl:2017:EMNLP20171,
  author    = {Yang, Min  and  Mei, Jincheng  and  Ji, Heng  and  Wei, Zhao  and  Zhao, Zhou  and  Chen, Xiaojun},
  title     = {Identifying and Tracking Sentiments and Topics from Social Media Texts during Natural Disasters},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {527--533},
  abstract  = {We study the problem of identifying the topics and sentiments and tracking
	their shifts from social media texts in different geographical regions during
	emergencies and disasters. We propose a location-based dynamic sentiment-topic
	model (LDST) which can jointly model topic, sentiment, time and Geolocation
	information. The experimental results demonstrate that LDST performs very well
	at discovering topics and sentiments from social media and tracking their
	shifts in different geographical regions during emergencies and disasters. We
	will release the data and source code after this work is published.},
  url       = {https://www.aclweb.org/anthology/D17-1055}
}

@InProceedings{yu-EtAl:2017:EMNLP20172,
  author    = {Yu, Liang-Chih  and  Wang, Jin  and  Lai, K. Robert  and  Zhang, Xuejie},
  title     = {Refining Word Embeddings for Sentiment Analysis},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {534--539},
  abstract  = {Word embeddings that can capture semantic and syntactic information from
	contexts have been extensively used for various natural language processing
	tasks. However, existing methods for learning context-based word embeddings
	typically fail to capture sufficient sentiment information. This may result in
	words with similar vector representations having an opposite sentiment polarity
	(e.g., good and bad), thus degrading sentiment analysis performance. Therefore,
	this study proposes a word vector refinement model that can be applied to any
	pre-trained word vectors (e.g., Word2vec and GloVe). The refinement model is
	based on adjusting the vector representations of words such that they can be
	closer to both semantically and sentimentally similar words and further away
	from sentimentally dissimilar words. Experimental results show that the
	proposed method can improve conventional word embeddings and outperform
	previously proposed sentiment embeddings for both binary and fine-grained
	classification on Stanford Sentiment Treebank (SST).},
  url       = {https://www.aclweb.org/anthology/D17-1056}
}

@InProceedings{akhtar-EtAl:2017:EMNLP2017,
  author    = {Akhtar, Md Shad  and  Kumar, Abhishek  and  Ghosal, Deepanway  and  Ekbal, Asif  and  Bhattacharyya, Pushpak},
  title     = {A Multilayer Perceptron based Ensemble Technique for Fine-grained Financial Sentiment Analysis},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {540--546},
  abstract  = {In this paper, we propose a novel method for combining deep learning and
	classical feature based models using a Multi-Layer Perceptron (MLP) network for
	financial sentiment analysis. We develop various deep learning models based on
	Convolutional Neural Network (CNN), Long Short Term Memory (LSTM) and Gated
	Recurrent Unit (GRU). These are trained on top of pre-trained,
	autoencoder-based, financial word embeddings and lexicon features. An ensemble
	is constructed by combining these deep learning models and a classical
	supervised model based on Support Vector Regression (SVR). We evaluate our
	proposed technique on a benchmark dataset of SemEval-2017 shared task on
	financial sentiment analysis. The proposed model shows impressive results on two
	datasets, i.e. microblogs and news headlines datasets. Comparisons show that
	our proposed model performs better than the existing state-of-the-art systems
	for the above two datasets by 2.0 and 4.1 cosine points, respectively.},
  url       = {https://www.aclweb.org/anthology/D17-1057}
}

@InProceedings{sharma-EtAl:2017:EMNLP2017,
  author    = {Sharma, Raksha  and  Somani, Arpan  and  Kumar, Lakshya  and  Bhattacharyya, Pushpak},
  title     = {Sentiment Intensity Ranking among Adjectives Using Sentiment Bearing Word Embeddings},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {547--552},
  abstract  = {Identification of intensity ordering among polar (positive or negative) words
	which have the same semantics can lead to a fine-grained sentiment analysis.
	For
	example, 'master', 'seasoned' and 'familiar' point to different intensity
	levels, though they all convey the same meaning (semantics), i.e., expertise:
	having a good
	knowledge of. In this paper, we propose a semi-supervised technique that uses
	sentiment
	bearing word embeddings to produce a continuous ranking among adjectives that
	share common semantics. Our system demonstrates a strong Spearman's rank
	correlation of 0.83 with the gold standard ranking. We show that sentiment
	bearing word embeddings facilitate a more accurate intensity ranking system
	than other standard word embeddings (word2vec and GloVe). Word2vec is the
	state-of-the-art for intensity ordering task.},
  url       = {https://www.aclweb.org/anthology/D17-1058}
}

@InProceedings{wang-zhang-liu:2017:EMNLP2017,
  author    = {Wang, Yasheng  and  Zhang, Yang  and  Liu, Bing},
  title     = {Sentiment Lexicon Expansion Based on Neural PU Learning, Double Dictionary Lookup, and Polarity Association},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {553--563},
  abstract  = {Although many sentiment lexicons in different languages exist, most are not
	comprehensive. In a recent sentiment analysis application, we used a large
	Chinese sentiment lexicon and found that it missed a large number of sentiment
	words in social media. This prompted us to make a new attempt to study
	sentiment lexicon expansion. This paper first poses the problem as a PU
	learning problem, which is a new formulation. It then proposes a new PU
	learning method suitable for our problem using a neural network. The results
	are enhanced further with a new dictionary-based technique and a novel polarity
	classification technique. Experimental results show that the proposed approach
	outperforms baseline methods greatly.},
  url       = {https://www.aclweb.org/anthology/D17-1059}
}

@InProceedings{xiong-hoang-wang:2017:EMNLP2017,
  author    = {Xiong, Wenhan  and  Hoang, Thien  and  Wang, William Yang},
  title     = {DeepPath: A Reinforcement Learning Method for Knowledge Graph Reasoning},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {564--573},
  abstract  = {We study the problem of learning to reason in large scale knowledge graphs
	(KGs). More specifically, we describe a novel reinforcement learning framework
	for learning multi-hop relational paths: we use a policy-based agent with
	continuous states based on knowledge graph embeddings, which reasons in a KG
	vector-space by sampling the most promising relation to extend its path. In
	contrast to prior work, our approach includes a reward function that takes the
	accuracy, diversity, and efficiency into consideration. Experimentally, we show
	that our proposed method outperforms a path-ranking based algorithm and
	knowledge graph embedding methods on Freebase and Never-Ending Language
	Learning datasets.},
  url       = {https://www.aclweb.org/anthology/D17-1060}
}

@InProceedings{nogueira-cho:2017:EMNLP2017,
  author    = {Nogueira, Rodrigo  and  Cho, Kyunghyun},
  title     = {Task-Oriented Query Reformulation with Reinforcement Learning},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {574--583},
  abstract  = {Search engines play an important role in our everyday lives by assisting us in
	finding the information we need. When we input a complex query, however,
	results are often far from satisfactory. In this work, we introduce a query
	reformulation system based on a neural network that rewrites a query to
	maximize the number of relevant documents returned.
	We train this neural network with reinforcement learning. The actions
	correspond to selecting terms to build a reformulated query, and the reward is
	the document recall. We evaluate our approach on three datasets against strong
	baselines and show a relative improvement of 5--20\% in terms of recall.
	Furthermore, we present a simple method to estimate a conservative upper-bound
	performance of a model in a particular environment and verify that there is
	still large room for improvements.},
  url       = {https://www.aclweb.org/anthology/D17-1061}
}

@InProceedings{zhang-lapata:2017:EMNLP2017,
  author    = {Zhang, Xingxing  and  Lapata, Mirella},
  title     = {Sentence Simplification with Deep Reinforcement Learning},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {584--594},
  abstract  = {Sentence simplification aims to make sentences easier to read and
	  understand. Most recent approaches draw on insights from machine
	  translation to learn simplification rewrites from monolingual
	  corpora of complex and simple sentences. We address the
	  simplification problem with an encoder-decoder model coupled with a
	  deep reinforcement learning framework. Our model, which we call
	  \textsc{Dress} (as shorthand for \textbf{D}eep \textbf{RE}inforcement
	  \textbf{S}entence \textbf{S}implification), explores the space of possible
	  simplifications while learning to optimize a reward function that
	  encourages outputs which are simple, fluent, and preserve the
	  meaning of the input. Experiments on three datasets demonstrate that
	  our model outperforms competitive simplification
	  systems.},
  url       = {https://www.aclweb.org/anthology/D17-1062}
}

@InProceedings{fang-li-cohn:2017:EMNLP2017,
  author    = {Fang, Meng  and  Li, Yuan  and  Cohn, Trevor},
  title     = {Learning how to Active Learn: A Deep Reinforcement Learning Approach},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {595--605},
  abstract  = {Active learning aims to select a small subset of data for annotation such that
	a classifier learned on the data is highly accurate. This is usually done using
	heuristic selection methods, however the effectiveness of such methods is
	limited and moreover, the performance of heuristics varies between datasets. To
	address these shortcomings, we introduce a novel formulation by reframing the
	active learning as a reinforcement learning problem and explicitly learning a
	data selection policy, where the policy takes the role of the active learning
	heuristic. Importantly, our method allows the selection policy learned using
	simulation to one language to be transferred to other languages. We demonstrate
	our method using cross-lingual named entity recognition, observing uniform
	improvements over traditional active learning algorithms.},
  url       = {https://www.aclweb.org/anthology/D17-1063}
}

@InProceedings{narayan-EtAl:2017:EMNLP2017,
  author    = {Narayan, Shashi  and  Gardent, Claire  and  Cohen, Shay B.  and  Shimorina, Anastasia},
  title     = {Split and Rephrase},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {606--616},
  abstract  = {We propose a new sentence simplification task (Split-and-Rephrase) where the
	aim is to split a complex sentence into a meaning preserving sequence of
	shorter sentences. Like sentence simplification, splitting-and-rephrasing has
	the potential of benefiting both natural language processing and societal
	applications. Because shorter sentences are generally better processed by NLP
	systems, it could be used as a preprocessing step which facilitates and
	improves the performance of parsers, semantic role labellers and machine
	translation systems. It should also be of use for people with reading
	disabilities because it allows the conversion of longer sentences into shorter
	ones. This paper makes two contributions towards this new task. First, we
	create and make available a benchmark consisting of 1,066,115 tuples mapping a
	single complex sentence to a sequence of sentences expressing the same meaning.
	Second, we propose five models (vanilla sequence-to-sequence to
	semantically-motivated models) to understand the difficulty of the proposed
	task.},
  url       = {https://www.aclweb.org/anthology/D17-1064}
}

@InProceedings{xu-EtAl:2017:EMNLP2017,
  author    = {Xu, Zhen  and  Liu, Bingquan  and  Wang, Baoxun  and  Sun, Chengjie  and  Wang, Xiaolong  and  Wang, Zhuoran  and  Qi, Chao},
  title     = {Neural Response Generation via GAN with an Approximate Embedding Layer},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {617--626},
  abstract  = {This paper presents a Generative Adversarial Network (GAN) to model single-turn
	short-text conversations, which trains a sequence-to-sequence (Seq2Seq) network
	for response generation simultaneously with a discriminative classifier that
	measures the differences between human-produced responses and machine-generated
	ones. In addition, the proposed method introduces an approximate embedding
	layer to solve the non-differentiable problem caused by the sampling-based
	output decoding procedure in the Seq2Seq generative model. The GAN setup
	provides an effective way to avoid noninformative responses (a.k.a. ``safe
	responses''), which are frequently observed in traditional neural response
	generators.
	The experimental results show that the proposed approach significantly
	outperforms existing neural response generation models in diversity metrics,
	with slight increases in relevance scores as well, when evaluated on both a
	Mandarin corpus and an English corpus.},
  url       = {https://www.aclweb.org/anthology/D17-1065}
}

@InProceedings{semeniuta-severyn-barth:2017:EMNLP2017,
  author    = {Semeniuta, Stanislau  and  Severyn, Aliaksei  and  Barth, Erhardt},
  title     = {A Hybrid Convolutional Variational Autoencoder for Text Generation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {627--637},
  abstract  = {In this paper we explore the effect of architectural choices on learning a
	variational autoencoder (VAE) for text generation. In contrast to the
	previously introduced VAE model for text where both the encoder and decoder are
	RNNs, we propose a novel hybrid architecture that blends fully feed-forward
	convolutional and deconvolutional components with a recurrent language model.
	Our architecture exhibits several attractive properties such as faster run time
	and convergence, ability to better handle long sequences and, more importantly,
	it helps to avoid the issue of the VAE collapsing to a deterministic model.},
  url       = {https://www.aclweb.org/anthology/D17-1066}
}

@InProceedings{hossain-EtAl:2017:EMNLP2017,
  author    = {Hossain, Nabil  and  Krumm, John  and  Vanderwende, Lucy  and  Horvitz, Eric  and  Kautz, Henry},
  title     = {Filling the Blanks (hint: plural noun) for Mad Libs Humor},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {638--647},
  abstract  = {Computerized generation of humor is a notoriously difficult AI problem. We
	develop an algorithm called Libitum that helps humans generate humor in a Mad
	Lib, which is a popular fill-in-the-blank game. The algorithm is based on a
	machine learned classifier that determines whether a potential fill-in word is
	funny in the context of the Mad Lib story. We use Amazon Mechanical Turk to
	create ground truth data and to judge humor for our classifier to mimic, and we
	make this data freely available. Our testing shows that Libitum successfully
	aids humans in filling in Mad Libs that are usually judged funnier than those
	filled in by humans with no computerized help. We go on to analyze why some
	words are better than others at making a Mad Lib funny.},
  url       = {https://www.aclweb.org/anthology/D17-1067}
}

@InProceedings{santus-EtAl:2017:EMNLP2017,
  author    = {Santus, Enrico  and  Chersoni, Emmanuele  and  Lenci, Alessandro  and  Blache, Philippe},
  title     = {Measuring Thematic Fit with Distributional Feature Overlap},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {648--658},
  abstract  = {In this paper, we introduce a new distributional method for modeling
	predicate-argument thematic fit judgments. 
	We use a syntax-based DSM to build a prototypical representation of
	verb-specific roles: for every verb, we extract the most salient second order
	contexts for each of its roles (i.e. the most salient dimensions of typical
	role fillers), and then we compute thematic fit as a weighted overlap between
	the top features of candidate fillers and role prototypes.
	Our experiments show that our method consistently outperforms a baseline
	re-implementing a state-of-the-art system, and achieves better or comparable
	results to those reported in the literature for the other unsupervised systems.
	Moreover, it provides an explicit representation of the features characterizing
	verb-specific semantic roles.},
  url       = {https://www.aclweb.org/anthology/D17-1068}
}

@InProceedings{mekala-EtAl:2017:EMNLP2017,
  author    = {Mekala, Dheeraj  and  Gupta, Vivek  and  Paranjape, Bhargavi  and  Karnick, Harish},
  title     = {{SCDV} : Sparse Composite Document Vectors using soft clustering over distributional representations},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {659--669},
  abstract  = {We present a feature vector formation technique for documents - Sparse
	Composite Document Vector (SCDV) - which overcomes several shortcomings of the
	current distributional paragraph vector representations that are widely used
	for text representation. In SCDV, word embeddings are clustered to capture
	multiple semantic contexts in which words occur. They are then chained together
	to form document topic-vectors that can express complex, multi-topic documents.
	Through extensive experiments on multi-class and multi-label classification
	tasks, we outperform the previous state-of-the-art method, NTSG. We also show
	that SCDV embeddings perform well on heterogeneous tasks like Topic Coherence,
	context-sensitive Learning and Information Retrieval. Moreover, we achieve a
	significant reduction in training and prediction times compared to other
	representation methods. SCDV achieves best of both worlds - better performance
	with lower time and space complexity.},
  url       = {https://www.aclweb.org/anthology/D17-1069}
}

@InProceedings{conneau-EtAl:2017:EMNLP2017,
  author    = {Conneau, Alexis  and  Kiela, Douwe  and  Schwenk, Holger  and  Barrault, Lo{\"\i}c  and  Bordes, Antoine},
  title     = {Supervised Learning of Universal Sentence Representations from Natural Language Inference Data},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {670--680},
  abstract  = {Many modern NLP systems rely on word embeddings, previously trained in an
	unsupervised manner on large corpora, as base features. Efforts to obtain
	embeddings for larger chunks of text, such as sentences, have however not been
	so successful. Several attempts at learning unsupervised representations of
	sentences have not reached satisfactory enough performance to be widely
	adopted.
	In this paper, we show how universal sentence representations trained using the
	supervised data of the Stanford Natural Language Inference datasets can
	consistently outperform unsupervised methods like SkipThought vectors on a wide
	range of transfer tasks. Much like how computer vision uses ImageNet to obtain
	features, which can then be transferred to other tasks, our work tends to
	indicate the suitability of natural language inference for transfer learning to
	other NLP tasks. Our encoder is publicly available.},
  url       = {https://www.aclweb.org/anthology/D17-1070}
}

@InProceedings{yanaka-EtAl:2017:EMNLP2017,
  author    = {Yanaka, Hitomi  and  Mineshima, Koji  and  Mart{\'\i}nez-G{\'o}mez, Pascual  and  Bekki, Daisuke},
  title     = {Determining Semantic Textual Similarity using Natural Deduction Proofs},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {681--691},
  abstract  = {Determining semantic textual similarity is a core research subject in natural
	language processing.
	Since vector-based models for sentence representation often use shallow
	information, capturing accurate semantics is difficult. By contrast, logical
	semantic representations capture deeper levels of sentence semantics, but their
	symbolic nature does not offer graded notions of textual similarity.
	We propose a method for determining semantic textual similarity by combining
	shallow features with features extracted from natural deduction proofs of
	bidirectional entailment relations between sentence pairs. For the natural
	deduction proofs, we use ccg2lambda, a higher-order automatic inference system,
	which converts Combinatory Categorial Grammar (CCG) derivation trees into
	semantic representations and conducts natural deduction proofs. Experiments
	show that our system was able to outperform other logic-based systems and that
	features derived from the proofs are effective for learning textual similarity.},
  url       = {https://www.aclweb.org/anthology/D17-1071}
}

@InProceedings{gong-EtAl:2017:EMNLP2017,
  author    = {Gong, Chen  and  Li, Zhenghua  and  Zhang, Min  and  Jiang, Xinzhou},
  title     = {Multi-Grained {Chinese} Word Segmentation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {692--703},
  abstract  = {Traditionally, word segmentation (WS) adopts the single-grained formalism,
	where a sentence corresponds to a single word sequence. However, Sproat et al.
	(1997) show that the inter-native-speaker consistency ratio over Chinese word
	boundaries is only 76\%, indicating single-grained WS (SWS) imposes unnecessary
	challenges on both manual annotation and statistical modeling.
	Moreover, WS results of different granularities can be complementary and
	beneficial for high-level applications.
	This work proposes and addresses multi-grained WS (MWS). We build a large-scale
	pseudo MWS dataset for model training and tuning by leveraging the annotation
	heterogeneity of three SWS datasets.
	Then we manually annotate 1,500 test sentences with true MWS annotations.
	Finally, we propose three benchmark approaches by casting MWS as constituent
	parsing and sequence labeling.
	Experiments and analysis lead to many interesting findings.},
  url       = {https://www.aclweb.org/anthology/D17-1072}
}

@InProceedings{zalmout-habash:2017:EMNLP2017,
  author    = {Zalmout, Nasser  and  Habash, Nizar},
  title     = {Don't Throw Those Morphological Analyzers Away Just Yet: Neural Morphological Disambiguation for {Arabic}},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {704--713},
  abstract  = {This paper presents a model for Arabic morphological disambiguation based on
	Recurrent Neural Networks (RNN). We train Long Short-Term Memory (LSTM) cells
	in several configurations and embedding levels to model the various
	morphological features. Our experiments show that these models outperform
	state-of-the-art systems without explicit use of feature engineering. However,
	adding learning features from a morphological analyzer to model the space of
	possible analyses provides additional improvement.
	We make use of the resulting morphological models for scoring and ranking the
	analyses of the morphological analyzer for morphological disambiguation. The
	results show significant gains in accuracy across several evaluation metrics.
	Our system results in 4.4\% absolute increase over the state-of-the-art in full
	morphological analysis accuracy (30.6\% relative error reduction), and 10.6\%
	(31.5\% relative error reduction) for out-of-vocabulary words.},
  url       = {https://www.aclweb.org/anthology/D17-1073}
}

@InProceedings{cotterell-EtAl:2017:EMNLP2017,
  author    = {Cotterell, Ryan  and  Vylomova, Ekaterina  and  Khayrallah, Huda  and  Kirov, Christo  and  Yarowsky, David},
  title     = {Paradigm Completion for Derivational Morphology},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {714--720},
  abstract  = {The generation of complex derived word forms has been an overlooked problem in
	NLP; we fill this gap by applying neural sequence-to-sequence models to the
	task. We overview the theoretical motivation for a paradigmatic treatment of
	derivational morphology, and introduce the task of derivational paradigm
	completion as a parallel to inflectional paradigm completion. State-of-the-art
	neural models adapted from the inflection task are able to learn the range of
	derivation patterns, and outperform a non-neural baseline by 16.4\%. However,
	due to semantic, historical, and lexical considerations involved in
	derivational morphology, future work will be needed to achieve performance
	parity with inflection-generating systems.},
  url       = {https://www.aclweb.org/anthology/D17-1074}
}

@InProceedings{stratos:2017:EMNLP2017,
  author    = {Stratos, Karl},
  title     = {A Sub-Character Architecture for {Korean} Language Processing},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {721--726},
  abstract  = {We introduce a novel sub-character architecture that exploits a unique
	compositional structure of the Korean language. Our method decomposes each
	character into a small set of primitive phonetic units called jamo letters from
	which character- and word-level representations are induced. The jamo letters
	divulge syntactic and semantic information that is difficult to access with
	conventional character-level units. They greatly alleviate the data sparsity
	problem, reducing the observation space to 1.6\% of the original while
	increasing accuracy in our experiments. We apply our architecture to dependency
	parsing and achieve dramatic improvement over strong lexical baselines.},
  url       = {https://www.aclweb.org/anthology/D17-1075}
}

@InProceedings{horsmann-zesch:2017:EMNLP2017,
  author    = {Horsmann, Tobias  and  Zesch, Torsten},
  title     = {Do {LSTMs} really work so well for {PoS} tagging? -- A replication study},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {727--736},
  abstract  = {A recent study by Plank et al. (2016) found that LSTM-based PoS taggers
	considerably improve over the current state-of-the-art when evaluated on the
	corpora of the Universal Dependencies project that use a coarse-grained tagset.
	We replicate this study using a fresh collection of 27 corpora of 21 languages
	that are annotated with fine-grained tagsets of varying size.
	Our replication confirms the result in general, and we additionally find that
	the advantage of LSTMs is even bigger for larger tagsets.
	However, we also find that for the very large tagsets of morphologically rich
	languages, hand-crafted morphological lexicons are still necessary to reach
	state-of-the-art performance.},
  url       = {https://www.aclweb.org/anthology/D17-1076}
}

@InProceedings{mcconnaughey-dai-bamman:2017:EMNLP2017,
  author    = {McConnaughey, Lara and Dai, Jennifer and Bamman, David},
  title     = {The Labeled Segmentation of Printed Books},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {737--747},
  url       = {https://www.aclweb.org/anthology/D17-1077},
  abstract  = {We introduce the task of book structure labeling: segmenting and assigning
    a fixed category (such as Table of Contents, Preface, Index) to the document
    structure of printed books. We manually annotate the page-level structural
    categories for a large dataset totaling 294,816 pages in 1,055 books evenly sampled
    from 1750-1922, and present empirical results comparing the performance of several
    classes of models. The best-performing model, a bidirectional LSTM with rich
    features, achieves an overall accuracy of 95.8 and a class-balanced macro F-score
    of 71.4.}
}

@InProceedings{cotterell-heigold:2017:EMNLP2017,
  author    = {Cotterell, Ryan and Heigold, Georg},
  title     = {Cross-lingual Character-Level Neural Morphological Tagging},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {748--759},
  url       = {https://www.aclweb.org/anthology/D17-1078},
  abstract  = {Even for common NLP tasks, sufficient supervision is not available in many
    languages -- morphological tagging is no exception. In the work presented here, we
    explore a transfer learning scheme, whereby we train character-level recurrent
    neural taggers to predict morphological taggings for high-resource languages and
    low-resource languages together. Learning joint character representations among
    multiple related languages successfully enables knowledge transfer from the
    high-resource languages to the low-resource ones.}
}

@InProceedings{zhou-EtAl:2017:EMNLP2017,
  author    = {Zhou, Hao  and  Yu, Zhenting  and  Zhang, Yue  and  Huang, Shujian  and  Dai, Xin-Yu  and  Chen, Jiajun},
  title     = {Word-Context Character Embeddings for {Chinese} Word Segmentation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {760--766},
  abstract  = {Neural parsers have benefited from automatically labeled data via
	dependency-context word embeddings.
	We investigate training character embeddings on a word-based context in a
	similar way, showing that the simple method improves state-of-the-art neural
	word segmentation models significantly, beating tri-training baselines for
	leveraging auto-segmented data.},
  url       = {https://www.aclweb.org/anthology/D17-1079}
}

@InProceedings{oshikiri:2017:EMNLP2017,
  author    = {Oshikiri, Takamasa},
  title     = {Segmentation-Free Word Embedding for Unsegmented Languages},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {767--772},
  url       = {https://www.aclweb.org/anthology/D17-1080},
  abstract  = {In this paper, we propose a new pipeline of word embedding for unsegmented
    languages, called segmentation-free word embedding, which does not require word
    segmentation as a preprocessing step. Unlike space-delimited languages, unsegmented
    languages, such as Chinese and Japanese, require word segmentation as a
    preprocessing step. However, word segmentation, that often requires manually
    annotated resources, is difficult and expensive, and unavoidable errors in word
    segmentation affect downstream tasks. To avoid these problems in learning word
    vectors of unsegmented languages, we consider word co-occurrence statistics over
    all possible candidates of segmentations based on frequent character n-grams
    instead of segmented sentences provided by conventional word segmenters. Our
    experiments of noun category prediction tasks on raw Twitter, Weibo, and Wikipedia
    corpora show that the proposed method outperforms the conventional approaches that
    require word segmenters.}
}

@InProceedings{sachan-dubey-xing:2017:EMNLP2017,
  author    = {Sachan, Mrinmaya and Dubey, Kumar and Xing, Eric},
  title     = {From Textbooks to Knowledge: A Case Study in Harvesting Axiomatic Knowledge from Textbooks to Solve Geometry Problems},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {773--784},
  url       = {https://www.aclweb.org/anthology/D17-1081},
  abstract  = {Textbooks are rich sources of information. Harvesting structured knowledge
    from textbooks is a key challenge in many educational applications. As a case
    study, we present an approach for harvesting structured axiomatic knowledge from
    math textbooks. Our approach uses rich contextual and typographical features
    extracted from raw textbooks. It leverages the redundancy and shared ordering
    across multiple textbooks to further refine the harvested axioms. These axioms are
    then parsed into rules that are used to improve the state-of-the-art in solving
    geometry problems.}
}

@InProceedings{lai-EtAl:2017:EMNLP2017,
  author    = {Lai, Guokun  and  Xie, Qizhe  and  Liu, Hanxiao  and  Yang, Yiming  and  Hovy, Eduard},
  title     = {{RACE}: Large-scale {ReAding} Comprehension Dataset From Examinations},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {785--794},
  abstract  = {We present RACE, a new dataset for benchmark evaluation of methods in the
	reading comprehension task. Collected from the English exams for middle and
	high school Chinese students in the age range between 12 to 18, RACE consists
	of near 28,000 passages and near 100,000 questions generated by human experts
	(English instructors), and covers a variety of topics which are carefully
	designed for evaluating the students' ability in understanding and reasoning.
	In particular, the proportion of questions that requires reasoning is much
	larger in RACE than that in other benchmark datasets for reading comprehension,
	and there is a significant gap between the performance of the state-of-the-art
	models (43\%) and the ceiling human performance (95\%). We hope this new dataset
	can serve as a valuable resource for research and evaluation in machine
	comprehension. The dataset is freely available at
	http://www.cs.cmu.edu/\textasciitilde{}glai1/data/race/ and the code is available at
	https://github.com/qizhex/RACE\_AR\_baselines.},
  url       = {https://www.aclweb.org/anthology/D17-1082}
}

@InProceedings{hopkins-EtAl:2017:EMNLP2017,
  author    = {Hopkins, Mark  and  Petrescu-Prahova, Cristian  and  Levin, Roie  and  Le Bras, Ronan  and  Herrasti, Alvaro  and  Joshi, Vidur},
  title     = {Beyond Sentential Semantic Parsing: Tackling the Math {SAT} with a Cascade of Tree Transducers},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {795--804},
  abstract  = {We present an approach for answering questions that span multiple sentences and
	exhibit sophisticated cross-sentence anaphoric phenomena, evaluating on a rich
	source of such questions -- the math portion of the Scholastic Aptitude Test
	(SAT). By using a tree transducer cascade as its basic architecture, our system
	propagates uncertainty from multiple sources (e.g. coreference resolution or
	verb interpretation) until it can be confidently resolved. Experiments show the
	first-ever results (43\% recall and 91\% precision) on SAT algebra word problems.
	We also apply our system to the public Dolphin algebra question set, and
	improve the state-of-the-art F1-score from 73.9\% to 77.0\%.},
  url       = {https://www.aclweb.org/anthology/D17-1083}
}

@InProceedings{huang-EtAl:2017:EMNLP20171,
  author    = {Huang, Danqing  and  Shi, Shuming  and  Lin, Chin-Yew  and  Yin, Jian},
  title     = {Learning Fine-Grained Expressions to Solve Math Word Problems},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {805--814},
  abstract  = {This paper presents a novel template-based method to solve math word problems.
	This method learns the mappings between math concept phrases in math word
	problems and their math expressions from training data. For each equation
	template, we automatically construct a rich template sketch by aggregating
	information from various problems with the same template. Our approach is
	implemented in a two-stage system. It first retrieves a few relevant equation
	system templates and aligns numbers in math word problems to those templates
	for candidate equation generation. It then does a fine-grained inference to
	obtain the final answer. Experiment results show that our method achieves an
	accuracy of 28.4\% on the linear Dolphin18K benchmark, which is 10\% (54\%
	relative) higher than previous state-of-the-art systems while achieving an
	accuracy increase of 12\% (59\% relative) on the TS6 benchmark subset.},
  url       = {https://www.aclweb.org/anthology/D17-1084}
}

@InProceedings{liu-EtAl:2017:EMNLP20172,
  author    = {Liu, Rui and Hu, Junjie and Wei, Wei and Yang, Zi and Nyberg, Eric},
  title     = {Structural Embedding of Syntactic Trees for Machine Comprehension},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {815--824},
  url       = {https://www.aclweb.org/anthology/D17-1085},
  abstract  = {Deep neural networks for machine comprehension typically utilizes only word
    or character embeddings without explicitly taking advantage of structured
    linguistic information such as constituency trees and dependency trees. In this
    paper, we propose structural embedding of syntactic trees (SEST), an algorithm
    framework to utilize structured information and encode them into vector
    representations that can boost the performance of algorithms for the machine
    comprehension. We evaluate our approach using a state-of-the-art neural attention
    model on the SQuAD dataset. Experimental results demonstrate that our model can
    accurately identify the syntactic boundaries of the sentences and extract answers
    that are syntactically coherent over the baseline methods.}
}

@InProceedings{long-EtAl:2017:EMNLP20172,
  author    = {Long, Teng  and  Bengio, Emmanuel  and  Lowe, Ryan  and  Cheung, Jackie Chi Kit  and  Precup, Doina},
  title     = {World Knowledge for Reading Comprehension: Rare Entity Prediction with Hierarchical {LSTMs} Using External Descriptions},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {825--834},
  abstract  = {Humans interpret texts with respect to some background information, or world
	knowledge, and we would like to develop automatic reading comprehension systems
	that can do the same. In this paper, we introduce a task and several models to
	drive progress towards this goal. In particular, we propose the task of rare
	entity prediction: given a web document with several entities removed, models
	are tasked with predicting the correct missing entities conditioned on the
	document context and the lexical resources. This task is challenging due to the
	diversity of language styles and the extremely large number of rare entities.
	We propose two recurrent neural network architectures which make use of
	external knowledge in the form of entity descriptions. Our experiments show
	that our hierarchical LSTM model performs significantly better at the rare
	entity prediction task than those that do not make use of external resources.},
  url       = {https://www.aclweb.org/anthology/D17-1086}
}

@InProceedings{golub-EtAl:2017:EMNLP2017,
  author    = {Golub, David  and  Huang, Po-Sen  and  He, Xiaodong  and  Deng, Li},
  title     = {Two-Stage Synthesis Networks for Transfer Learning in Machine Comprehension},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {835--844},
  abstract  = {We develop a technique for transfer learning in machine comprehension (MC)
	using a novel two-stage synthesis network.  Given a high performing MC model in
	one domain, our technique aims to answer questions about documents in another
	domain, where we use no labeled data of question-answer pairs. Using the
	proposed synthesis network with a pretrained model on the SQuAD dataset, we
	achieve an F1 measure of 46.6\% on the challenging NewsQA dataset, approaching
	performance of in-domain models (F1 measure of 50.0\%) and outperforming the
	out-of-domain baseline by 7.6\%, without use of provided annotations.},
  url       = {https://www.aclweb.org/anthology/D17-1087}
}

@InProceedings{wang-liu-shi:2017:EMNLP2017,
  author    = {Wang, Yan and Liu, Xiaojiang and Shi, Shuming},
  title     = {Deep Neural Solver for Math Word Problems},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {845--854},
  url       = {https://www.aclweb.org/anthology/D17-1088},
  abstract  = {This paper presents a deep neural solver to automatically solve math word
    problems. In contrast to previous statistical learning approaches, we directly
    translate math word problems to equation templates using a recurrent neural network
    (RNN) model, without sophisticated feature engineering. We further design a hybrid
    model that combines the RNN model and a similarity-based retrieval model to achieve
    additional performance improvement. Experiments conducted on a large dataset show
    that the RNN model and the hybrid model significantly outperform state-of-the-art
    statistical learning methods for math word problem solving.}
}

@InProceedings{p-garg-shevade:2017:EMNLP2017,
  author    = {P, Deepak and Garg, Dinesh and Shevade, Shirish},
  title     = {Latent Space Embedding for Retrieval in Question-Answer Archives},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {855--865},
  url       = {https://www.aclweb.org/anthology/D17-1089},
  abstract  = {Community-driven Question Answering (CQA) systems such as Yahoo! Answers
    have become valuable sources of reusable information. CQA retrieval enables usage
    of historical CQA archives to solve new questions posed by users. This task has
    received much recent attention, with methods building upon literature from
    translation models, topic models, and deep learning. In this paper, we devise a CQA
    retrieval technique, LASER-QA, that embeds question-answer pairs within a unified
    latent space preserving the local neighborhood structure of question and answer
    spaces. The idea is that such a space mirrors semantic similarity among questions
    as well as answers, thereby enabling high quality retrieval. Through an empirical
    analysis on various real-world QA datasets, we illustrate the improved
    effectiveness of LASER-QA over state-of-the-art methods.}
}

@InProceedings{duan-EtAl:2017:EMNLP2017,
  author    = {Duan, Nan and Tang, Duyu and Chen, Peng and Zhou, Ming},
  title     = {Question Generation for Question Answering},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {866--874},
  url       = {https://www.aclweb.org/anthology/D17-1090},
  abstract  = {This paper presents how to generate questions from given passages using
    neural networks, where large scale QA pairs are automatically crawled and processed
    from Community-QA website, and used as training data. The contribution of the paper
    is 2-fold: First, two types of question generation approaches are proposed, one is
    a retrieval-based method using convolution neural network (CNN), the other is a
    generation-based method using recurrent neural network (RNN); Second, we show how
    to leverage the generated questions to improve existing question answering systems.
    We evaluate our question generation method for the answer sentence selection task
    on three benchmark datasets, including SQuAD, MS MARCO, and WikiQA. Experimental
    results show that, by using generated questions as an extra signal, significant QA
    improvement can be achieved.}
}

@InProceedings{dong-EtAl:2017:EMNLP2017,
  author    = {Dong, Li and Mallinson, Jonathan and Reddy, Siva and Lapata, Mirella},
  title     = {Learning to Paraphrase for Question Answering},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {875--886},
  url       = {https://www.aclweb.org/anthology/D17-1091},
  abstract  = {Question answering (QA) systems are sensitive to the many different ways
    natural language expresses the same information need. In this paper we turn to
    paraphrases as a means of capturing this knowledge and present a general framework
    which learns felicitous paraphrases for various QA tasks. Our method is trained
    end-to-end using question-answer pairs as a supervision signal. A question and its
    paraphrases serve as input to a neural scoring model which assigns higher weights
    to linguistic expressions most likely to yield correct answers. We evaluate our
    approach on QA over Freebase and answer sentence selection. Experimental results on
    three datasets show that our framework consistently improves performance, achieving
    competitive results despite the use of simple QA models.}
}

@InProceedings{meng-rumshisky-romanov:2017:EMNLP2017,
  author    = {Meng, Yuanliang  and  Rumshisky, Anna  and  Romanov, Alexey},
  title     = {Temporal Information Extraction for Question Answering Using Syntactic Dependencies in an {LSTM}-based Architecture},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {887--896},
  abstract  = {In this paper, we propose to use a set of simple, uniform in architecture
	LSTM-based models to recover different kinds of temporal relations from text.
	Using the shortest dependency path between entities as input, the same
	architecture is used to extract intra-sentence, cross-sentence, and document
	creation time relations. A ``double-checking'' technique reverses entity pairs
	in classification, boosting the recall of positive cases and reducing
	misclassifications between opposite classes. An efficient pruning algorithm
	resolves conflicts globally. Evaluated on QA-TempEval (SemEval2015 Task 5), our
	proposed technique outperforms state-of-the-art methods by a large margin. We
	also conduct intrinsic evaluation and post state-of-the-art results on
	Timebank-Dense.},
  url       = {https://www.aclweb.org/anthology/D17-1092}
}

@InProceedings{tymoshenko-bonadiman-moschitti:2017:EMNLP2017,
  author    = {Tymoshenko, Kateryna  and  Bonadiman, Daniele  and  Moschitti, Alessandro},
  title     = {Ranking Kernels for Structures and Embeddings: A Hybrid Preference and Classification Model},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {897--902},
  abstract  = {Recent work has shown that Tree Kernels (TKs) and Convolutional Neural Networks
	(CNNs) obtain the state of the art in answer sentence reranking. Additionally,
	their combination used in Support Vector Machines (SVMs) is promising as it can
	exploit both the syntactic patterns captured by TKs and the embeddings learned
	by CNNs. However, the embeddings are constructed according to a classification
	function, which is not directly exploitable in the preference ranking algorithm
	of SVMs. In this work, we propose a new hybrid approach combining preference
	ranking applied to TKs and pointwise ranking applied to CNNs. We show that our
	approach produces better results on two well-known and rather different
	datasets: WikiQA for answer sentence selection and SemEval cQA for comment
	selection in Community Question Answering.},
  url       = {https://www.aclweb.org/anthology/D17-1093}
}

@InProceedings{yavuz-EtAl:2017:EMNLP2017,
  author    = {Yavuz, Semih  and  Gur, Izzeddin  and  Su, Yu  and  Yan, Xifeng},
  title     = {Recovering Question Answering Errors via Query Revision},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {903--909},
  abstract  = {The existing factoid QA systems often
	lack a post-inspection component that can
	help models recover from their own mistakes.
	In this work, we propose to crosscheck
	the corresponding KB relations behind
	the predicted answers and identify
	potential inconsistencies. Instead of developing
	a new model that accepts evidences
	collected from these relations, we choose
	to plug them back to the original questions
	directly and check if the revised question
	makes sense or not. A bidirectional LSTM
	is applied to encode revised questions. We
	develop a scoring mechanism over the revised
	question encodings to refine the predictions
	of a base QA system. This approach
	can improve the F1 score of STAGG
	(Yih et al., 2015), one of the leading QA
	systems, from 52.5\% to 53.9\% on WEBQUESTIONS
	data.},
  url       = {https://www.aclweb.org/anthology/D17-1094}
}

@InProceedings{delbrouck-dupont:2017:EMNLP2017,
  author    = {Delbrouck, Jean-Benoit  and  Dupont, St{\'e}phane},
  title     = {An empirical study on the effectiveness of images in Multimodal Neural Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {910--919},
  abstract  = {In state-of-the-art Neural Machine Translation (NMT), an attention
	mechanism is used during decoding to enhance the translation. At every step,
	the decoder uses this mechanism to focus on different parts of the source
	sentence to gather the most useful information before outputting its target
	word. Recently, the effectiveness of the attention mechanism has also been
	explored for multi-modal tasks, where it becomes possible to focus both on
	sentence parts and image regions that they describe. In this paper, we compare
	several attention mechanism on the multi-modal translation task (English, image
	$\rightarrow$ German) and evaluate the ability of the model to make use of
	images to improve translation. We surpass state-of-the-art scores on the
	Multi30k data set, we nevertheless identify and report different misbehavior of
	the machine while translating.},
  url       = {https://www.aclweb.org/anthology/D17-1095}
}

@InProceedings{vijayakumar-vedantam-parikh:2017:EMNLP2017,
  author    = {Vijayakumar, Ashwin  and  Vedantam, Ramakrishna  and  Parikh, Devi},
  title     = {Sound-Word2Vec: Learning Word Representations Grounded in Sounds},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {920--925},
  abstract  = {To be able to interact better with humans, it is crucial for machines to
	understand sound -- a primary modality of human perception. Previous works
	have used sound to learn embeddings for improved generic semantic similarity
	assessment. In this work, we treat sound as a first-class citizen, studying
	downstream textual tasks which require aural grounding. To this end, we
	propose sound-word2vec -- a new embedding scheme that learns specialized word
	embeddings grounded in sounds. For example, we learn that two seemingly
	(semantically) unrelated concepts, like leaves and paper are similar due to the
	similar rustling sounds they make. Our embeddings prove useful in textual
	tasks requiring aural reasoning like text-based sound retrieval and discovering
	Foley sound effects (used in movies). Moreover, our embedding space captures
	interesting dependencies between words and onomatopoeia and outperforms prior
	work on aurally-relevant word relatedness datasets such as AMEN and ASLex.},
  url       = {https://www.aclweb.org/anthology/D17-1096}
}

@InProceedings{mahendru-EtAl:2017:EMNLP2017,
  author    = {Mahendru, Aroma  and  Prabhu, Viraj  and  Mohapatra, Akrit  and  Batra, Dhruv  and  Lee, Stefan},
  title     = {The Promise of Premise: Harnessing Question Premises in Visual Question Answering},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {926--935},
  abstract  = {In this paper, we make a simple observation that questions about images often
	contain premises -- objects and relationships implied by the question -- and
	that reasoning about premises can help Visual Question Answering (VQA) models
	respond more intelligently to irrelevant or previously unseen questions.
	When presented with a question that is irrelevant to an image, state-of-the-art
	VQA models will still answer purely based on learned language biases, resulting
	in non-sensical or even misleading answers. We note that a visual question is
	irrelevant to an image if at least one of its premises is false (i.e. not
	depicted in the image). We leverage this observation to construct a dataset for
	Question Relevance Prediction and Explanation (QRPE) by searching for false
	premises. We train novel question relevance detection models and show that
	models that reason about premises consistently outperform models that do not.
	We also find that forcing standard VQA models to reason about premises during
	training can lead to improvements on tasks requiring compositional reasoning.},
  url       = {https://www.aclweb.org/anthology/D17-1097}
}

@InProceedings{anderson-EtAl:2017:EMNLP2017,
  author    = {Anderson, Peter  and  Fernando, Basura  and  Johnson, Mark  and  Gould, Stephen},
  title     = {Guided Open Vocabulary Image Captioning with Constrained Beam Search},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {936--945},
  abstract  = {Existing image captioning models do not generalize well to out-of-domain images
	containing novel scenes or objects. This limitation severely hinders the use of
	these models in real world applications dealing with images in the wild. We
	address this problem using a flexible approach that enables existing deep
	captioning architectures to take advantage of image taggers at test time,
	without re-training. Our method uses constrained beam search to force the
	inclusion of selected tag words in the output, and fixed, pretrained word
	embeddings to facilitate vocabulary expansion to previously unseen tag words.
	Using this approach we achieve state of the art results for out-of-domain
	captioning on MSCOCO (and improved results for in-domain captioning). Perhaps
	surprisingly, our results significantly outperform approaches that incorporate
	the same tag predictions into the learning algorithm. We also show that we can
	significantly improve the quality of generated ImageNet captions by leveraging
	ground-truth labels.},
  url       = {https://www.aclweb.org/anthology/D17-1098}
}

@InProceedings{zellers-choi:2017:EMNLP2017,
  author    = {Zellers, Rowan  and  Choi, Yejin},
  title     = {Zero-Shot Activity Recognition with Verb Attribute Induction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {946--958},
  abstract  = {In this paper, we investigate large-scale zero-shot activity recognition by
	modeling the visual and linguistic attributes of action verbs. For example, the
	verb ``salute'' has several properties, such as being a light movement, a
	social act, and short in duration. We use these attributes as the internal
	mapping between visual and textual representations to reason about a previously
	unseen action. In contrast to much prior work that assumes access to gold
	standard attributes for zero-shot classes and focuses primarily on object
	attributes, our model uniquely learns to infer action attributes from
	dictionary definitions and distributed word representations. Experimental
	results confirm that action attributes inferred from language can provide a
	predictive signal for zero-shot prediction of previously unseen activities.},
  url       = {https://www.aclweb.org/anthology/D17-1099}
}

@InProceedings{zarriess-schlangen:2017:EMNLP2017,
  author    = {Zarrie{\ss}, Sina  and  Schlangen, David},
  title     = {Deriving continous grounded meaning representations from referentially structured multimodal contexts},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {959--965},
  abstract  = {Corpora of referring expressions paired with their visual referents are a good
	source for learning word meanings directly grounded in visual representations.
	Here, we explore additional ways of extracting from them word representations
	linked to  multi-modal context: through expressions that refer to the same
	object, and through expressions that refer to different objects in the same
	scene. We show that continuous meaning representations derived from these
	contexts capture complementary aspects of similarity, even if not
	outperforming textual embeddings trained on very large amounts of raw text when
	tested on standard similarity benchmarks. We propose a new task for evaluating
	grounded meaning representations---detection of potentially co-referential
	phrases---and show that it requires precise denotational representations of
	attribute meanings, which our method provides.},
  url       = {https://www.aclweb.org/anthology/D17-1100}
}

@InProceedings{yu-bansal-berg:2017:EMNLP2017,
  author    = {Yu, Licheng  and  Bansal, Mohit  and  Berg, Tamara},
  title     = {Hierarchically-Attentive RNN for Album Summarization and Storytelling},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {966--971},
  abstract  = {We address the problem of end-to-end visual storytelling. 
	Given a photo album, our model first selects the most representative (summary)
	photos, and then composes a natural language story for the album.
	For this task, we make use of the Visual Storytelling dataset and a model
	composed of three hierarchically-attentive Recurrent Neural Nets (RNNs) to:
	encode the album photos, select representative (summary) photos, and  compose
	the story. Automatic and human evaluations show our model achieves better
	performance on
	selection, generation, and retrieval than baselines.},
  url       = {https://www.aclweb.org/anthology/D17-1101}
}

@InProceedings{fu-EtAl:2017:EMNLP2017,
  author    = {Fu, Cheng-Yang  and  Lee, Joon  and  Bansal, Mohit  and  Berg, Alexander},
  title     = {Video Highlight Prediction Using Audience Chat Reactions},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {972--978},
  abstract  = {Sports channel video portals offer an exciting domain for research on
	multimodal, multilingual analysis. We present methods addressing the problem of
	automatic video highlight prediction based on joint visual features and textual
	analysis of the real-world audience discourse with complex slang, in both
	English and traditional Chinese. We present a novel dataset based on League of
	Legends championships recorded from North American and Taiwanese Twitch.tv
	channels (will be released for further research), and demonstrate strong
	results on these using multimodal, character-level CNN-RNN model architectures.},
  url       = {https://www.aclweb.org/anthology/D17-1102}
}

@InProceedings{pasunuru-bansal:2017:EMNLP2017,
  author    = {Pasunuru, Ramakanth  and  Bansal, Mohit},
  title     = {Reinforced Video Captioning with Entailment Rewards},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {979--985},
  abstract  = {Sequence-to-sequence models have shown promising  improvements on the temporal
	task of video captioning, but they optimize word-level cross-entropy loss
	during training. First, using policy gradient and mixed-loss methods for
	reinforcement learning,  we directly optimize sentence-level task-based metrics
	(as rewards), achieving significant improvements over the baseline, based on
	both automatic metrics and human evaluation on multiple datasets. Next, we
	propose a novel entailment-enhanced reward (CIDEnt) that corrects
	phrase-matching based metrics (such as CIDEr) to only allow for
	logically-implied partial matches and avoid contradictions, achieving further
	significant improvements over the CIDEr-reward model. Overall, our
	CIDEnt-reward model achieves the new state-of-the-art on the MSR-VTT dataset.},
  url       = {https://www.aclweb.org/anthology/D17-1103}
}

@InProceedings{mu-hartshorne-odonnell:2017:EMNLP2017,
  author    = {Mu, Jesse  and  Hartshorne, Joshua K.  and  O'Donnell, Timothy},
  title     = {Evaluating Hierarchies of Verb Argument Structure with Hierarchical Clustering},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {986--991},
  abstract  = {Verbs can only be used with a few specific arrangements of their arguments
	(syntactic frames). Most theorists note that verbs can be organized into a
	hierarchy of verb classes based on the frames they admit. Here we show that
	such a hierarchy is objectively well-supported by the patterns of verbs and
	frames in English, since a systematic hierarchical clustering algorithm
	converges on the same structure as the handcrafted taxonomy of VerbNet, a
	broad-coverage verb lexicon. We also show that the hierarchies capture
	meaningful psychological dimensions of generalization by predicting novel verb
	coercions by human participants. We discuss limitations of a simple
	hierarchical representation and suggest similar approaches for identifying the
	representations underpinning verb argument structure.},
  url       = {https://www.aclweb.org/anthology/D17-1104}
}

@InProceedings{calixto-liu:2017:EMNLP2017,
  author    = {Calixto, Iacer  and  Liu, Qun},
  title     = {Incorporating Global Visual Features into Attention-based Neural Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {992--1003},
  abstract  = {We introduce multi-modal, attention-based neural machine translation (NMT)
	models which incorporate visual features into different parts of both the
	encoder and the decoder. Global image features are extracted using a
	pre-trained convolutional neural network and are incorporated (i) as words in
	the source sentence, (ii) to initialise the encoder hidden state, and (iii) as
	additional data to initialise the decoder hidden state. In our experiments, we
	evaluate translations into English and German, how different strategies to
	incorporate global image features compare and which ones perform best. We also
	study the impact that adding synthetic multi-modal, multilingual data brings
	and find that the additional data have a positive impact on multi-modal NMT
	models. We report new state-of-the-art results and our best models also
	significantly improve on a comparable phrase-based Statistical MT (PBSMT) model
	trained on the Multi30k data set according to all metrics evaluated. To the
	best of our knowledge, it is the first time a purely neural model significantly
	improves over a PBSMT model on all metrics evaluated on this data set.},
  url       = {https://www.aclweb.org/anthology/D17-1105}
}

@InProceedings{misra-langford-artzi:2017:EMNLP2017,
  author    = {Misra, Dipendra  and  Langford, John  and  Artzi, Yoav},
  title     = {Mapping Instructions and Visual Observations to Actions with Reinforcement Learning},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1004--1015},
  abstract  = {We propose to directly map raw visual observations and text input to actions
	for instruction execution. While existing approaches assume access to
	structured environment representations or use a pipeline of separately trained
	models, we learn a single model to jointly reason about linguistic and visual
	input. We use reinforcement learning in a contextual bandit setting to train a
	neural network agent. To guide the agent's exploration, we use reward shaping
	with different forms of supervision. Our approach does not require intermediate
	representations, planning procedures, or training different models. We evaluate
	in a simulated environment, and show significant improvements over supervised
	learning and common reinforcement learning variants.},
  url       = {https://www.aclweb.org/anthology/D17-1106}
}

@InProceedings{fraser-EtAl:2017:EMNLP2017,
  author    = {Fraser, Kathleen C.  and  Lundholm Fors, Kristina  and  Kokkinakis, Dimitrios  and  Nordlund, Arto},
  title     = {An analysis of eye-movements during reading for the detection of mild cognitive impairment},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1016--1026},
  abstract  = {We present a machine learning analysis of eye-tracking data for the detection
	of mild cognitive impairment, a decline in cognitive abilities that is
	associated with an increased risk of developing dementia. We compare two
	experimental configurations (reading aloud versus reading silently), as well as
	two methods of combining information from the two trials (concatenation and
	merging). Additionally, we annotate the words being read with information about
	their frequency and syntactic category, and use these annotations to generate
	new features. Ultimately, we are able to distinguish between participants with
	and without cognitive impairment with up to 86\% accuracy.},
  url       = {https://www.aclweb.org/anthology/D17-1107}
}

@InProceedings{ning-feng-roth:2017:EMNLP2017,
  author    = {Ning, Qiang  and  Feng, Zhili  and  Roth, Dan},
  title     = {A Structured Learning Approach to Temporal Relation Extraction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1027--1037},
  abstract  = {Identifying temporal relations between events is an essential step towards
	natural language understanding. However, the temporal relation between two
	events in a story depends on, and is often dictated by, relations among other
	events. Consequently, effectively identifying temporal relations between events
	is a challenging problem even for human annotators. This paper suggests that it
	is important to take these dependencies into account while learning to identify
	these relations and proposes a structured learning approach to address this
	challenge. As a byproduct, this provides a new perspective on handling missing
	relations, a known issue that hurts existing methods. As we show, the proposed
	approach results in significant improvements on the two commonly used data sets
	for this problem.},
  url       = {https://www.aclweb.org/anthology/D17-1108}
}

@InProceedings{chaganty-EtAl:2017:EMNLP2017,
  author    = {Chaganty, Arun  and  Paranjape, Ashwin  and  Liang, Percy  and  Manning, Christopher D.},
  title     = {Importance sampling for unbiased on-demand evaluation of knowledge base population},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1038--1048},
  abstract  = {Knowledge base population (KBP) systems take in a large document corpus and
	extract entities and their relations. Thus far, KBP evaluation has relied on
	judgements on the pooled predictions of existing systems.
	We show that this evaluation is problematic: when a new system predicts a
	previously unseen relation, it is penalized even if it is correct. This leads
	to significant bias against new systems, which counterproductively discourages
	innovation in the field. Our first contribution is a new importance-sampling
	based evaluation which corrects for this bias by annotating a new system's
	predictions on-demand via crowdsourcing. We show this eliminates bias and
	reduces variance using data from the 2015 TAC KBP task. Our second contribution
	is an implementation of our method made publicly available as an online KBP
	evaluation service. We pilot the service by testing diverse state-of-the-art
	systems on the TAC KBP 2016 corpus and obtain accurate scores in a cost
	effective manner.},
  url       = {https://www.aclweb.org/anthology/D17-1109}
}

@InProceedings{hui-EtAl:2017:EMNLP2017,
  author    = {Hui, Kai  and  Yates, Andrew  and  Berberich, Klaus  and  de Melo, Gerard},
  title     = {PACRR: A Position-Aware Neural IR Model for Relevance Matching},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1049--1058},
  abstract  = {In order to adopt deep learning for information retrieval, models are needed
	that can capture all relevant information required to assess the relevance of a
	document to a given
	user query. While previous works have successfully captured unigram term
	matches, how to fully employ position-dependent information such as proximity
	and term dependencies has been insufficiently explored. In this work, we
	propose a novel neural IR model named PACRR
	aiming at better modeling position-dependent interactions between a query and a
	document.
	Extensive experiments on six years' TREC Web Track data confirm that the
	proposed model yields better results under multiple benchmarks.},
  url       = {https://www.aclweb.org/anthology/D17-1110}
}

@InProceedings{raiman-miller:2017:EMNLP2017,
  author    = {Raiman, Jonathan  and  Miller, John},
  title     = {Globally Normalized Reader},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1059--1069},
  abstract  = {Rapid progress has been made towards question answering (QA) systems that can
	extract answers from text. Existing neural approaches make use of expensive
	bi-directional attention mechanisms or score all possible answer spans,
	limiting scalability. We propose instead to cast extractive QA as an iterative
	search problem: select the answer's sentence, start word, and end word. This
	representation reduces the space of each search step and allows computation to
	be conditionally allocated to promising search paths. We show that globally
	normalizing the decision process and back-propagating through beam search makes
	this representation viable and learning efficient. We empirically demonstrate
	the benefits of this approach using our model, Globally Normalized Reader
	(GNR), which achieves the second highest single model performance on the
	Stanford Question Answering Dataset (68.4 EM, 76.21 F1 dev) and is 24.7x faster
	than bi-attention-flow. We also introduce a data-augmentation method to produce
	semantically valid examples by aligning named entities to a knowledge base and
	swapping them with new entities of the same type. This method  improves the
	performance of all models considered in this work and is of independent
	interest for a variety of NLP tasks.},
  url       = {https://www.aclweb.org/anthology/D17-1111}
}

@InProceedings{elsner-shain:2017:EMNLP2017,
  author    = {Elsner, Micha  and  Shain, Cory},
  title     = {Speech segmentation with a neural encoder model of working memory},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1070--1080},
  abstract  = {We present the first unsupervised LSTM speech segmenter as a cognitive model of
	the acquisition of words from unsegmented input. Cognitive biases toward
	phonological and syntactic predictability in speech are rooted in the
	limitations of human memory (Baddeley et al., 1998); compressed representations
	are easier to acquire and retain in memory. To model the biases introduced by
	these memory limitations, our system uses an LSTM-based encoder-decoder with a
	small number of hidden units, then searches for a segmentation that minimizes
	autoencoding loss. Linguistically meaningful segments (e.g. words) should share
	regular patterns of features that facilitate decoder performance in comparison
	to random segmentations, and we show that our learner discovers these patterns
	when trained on either phoneme sequences or raw acoustics. To our knowledge,
	ours is the first fully unsupervised system to be able to segment both symbolic
	and acoustic representations of speech.},
  url       = {https://www.aclweb.org/anthology/D17-1112}
}

@InProceedings{bulat-clark-shutova:2017:EMNLP2017,
  author    = {Bulat, Luana  and  Clark, Stephen  and  Shutova, Ekaterina},
  title     = {Speaking, Seeing, Understanding: Correlating semantic models with conceptual representation in the brain},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1081--1091},
  abstract  = {Research in computational semantics is increasingly guided by our understanding
	of human semantic processing. However, semantic models are typically studied in
	the context of natural language processing system performance. In this paper,
	we present a systematic evaluation and comparison of a range of widely-used,
	state-of-the-art semantic models in their ability to predict patterns of
	conceptual representation in the human brain. Our results provide new insights
	both for the design of computational semantic models and for further research
	in cognitive neuroscience.},
  url       = {https://www.aclweb.org/anthology/D17-1113}
}

@InProceedings{li-EtAl:2017:EMNLP20171,
  author    = {Li, Haoran  and  Zhu, Junnan  and  Ma, Cong  and  Zhang, Jiajun  and  Zong, Chengqing},
  title     = {Multi-modal Summarization for Asynchronous Collection of Text, Image, Audio and Video},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1092--1102},
  abstract  = {The rapid increase of the multimedia data over the Internet necessitates
	multi-modal summarization from collections of text, image, audio and video.  
	In this work, we propose an extractive Multi-modal Summarization (MMS) method
	which can automatically generate a textual summary given a set of documents,
	images, audios and videos related to a specific topic. The key idea is to
	bridge the semantic gaps between multi-modal contents. For audio information,
	we design an approach to selectively use its transcription. For vision
	information, we learn joint representations of texts and images using a neural
	network. Finally, all the multi-modal aspects are considered to generate the
	textual summary by maximizing the salience, non-redundancy, readability and
	coverage through budgeted optimization of submodular functions.  We further
	introduce an MMS corpus in English and Chinese. The experimental results on
	this dataset demonstrate that our
	method outperforms other competitive baseline methods.},
  url       = {https://www.aclweb.org/anthology/D17-1114}
}

@inproceedings{zadeh-EtAl:2017:EMNLP2017,
  author    = {Zadeh, Amir and Chen, Minghai and Poria, Soujanya and Cambria, Erik and Morency, Louis-Philippe},
  title     = {Tensor Fusion Network for Multimodal Sentiment Analysis},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen, Denmark},
  month     = {September},
  year      = {2017},
  pages     = {1103--1114},
  url       = {https://www.aclweb.org/anthology/D17-1115},
  abstract  = {Multimodal sentiment analysis is an increasingly popular research area, which
    extends the conventional language-based definition of sentiment analysis to a
    multimodal setup where other relevant modalities accompany language. In this
    paper, we pose the problem of multimodal sentiment analysis as modeling
    intra-modality and inter-modality dynamics. We introduce a novel model, termed
    Tensor Fusion Networks, which learns both such dynamics end-to-end. The
    proposed approach is tailored for the volatile nature of spoken language in
    online videos as well as accompanying gestures and voice. In the experiments,
    our model outperforms state-of-the-art approaches for both multimodal and
    unimodal sentiment analysis.},
}

@InProceedings{joseph-EtAl:2017:EMNLP2017,
  author    = {Joseph, Kenneth  and  Friedland, Lisa  and  Hobbs, William  and  Lazer, David  and  Tsur, Oren},
  title     = {{ConStance}: Modeling Annotation Contexts to Improve Stance Classification},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1115--1124},
  abstract  = {Manual annotations are a prerequisite for many applications of machine
	learning.
	However, weaknesses in the annotation process itself are easy to overlook. In
	particular, scholars often choose what information to give to annotators
	without examining these decisions empirically.
	For subjective tasks such as sentiment analysis, sarcasm, and stance detection,
	such choices can impact results.
	Here, for the task of political stance detection on Twitter, we show that
	providing
	too little context can result in noisy and uncertain annotations,
	whereas providing too strong a context may cause it to outweigh other signals.
	To characterize and reduce these biases, we develop ConStance, a general model
	for reasoning about annotations across information
	conditions.
	Given conflicting labels produced by multiple annotators seeing the same
	instances with different contexts, ConStance simultaneously
	estimates gold standard labels and also learns a classifier for new instances.
	We show that the classifier learned by ConStance outperforms
	a variety of baselines at predicting political stance, while the model's
	interpretable parameters shed light on the effects of each context.},
  url       = {https://www.aclweb.org/anthology/D17-1116}
}

@InProceedings{pavlopoulos-malakasiotis-androutsopoulos:2017:EMNLP2017,
  author    = {Pavlopoulos, John  and  Malakasiotis, Prodromos  and  Androutsopoulos, Ion},
  title     = {Deeper Attention to Abusive User Content Moderation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1125--1135},
  abstract  = {Experimenting with a new dataset of 1.6M
	user comments from a news portal and an
	existing dataset of 115K Wikipedia talk
	page comments, we show that an RNN operating
	on word embeddings outperforms
	the previous state of the art in moderation,
	which used logistic regression or an MLP
	classifier with character or word n-grams.
	We also compare against a CNN operating
	on word embeddings, and a word-list
	baseline. A novel, deep, classification-specific
	attention mechanism improves the
	performance of the RNN further, and can
	also highlight suspicious words for free,
	without including highlighted words in the
	training data. We consider both fully automatic
	and semi-automatic moderation.},
  url       = {https://www.aclweb.org/anthology/D17-1117}
}

@inproceedings{dubossarsky-weinshall-grossman:2017:EMNLP2017,
  author    = {Dubossarsky, Haim and Weinshall, Daphna and Grossman, Eitan},
  title     = {Outta Control: Laws of Semantic Change and Inherent Biases in Word Representation Models},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen, Denmark},
  month     = {September},
  year      = {2017},
  pages     = {1136--1145},
  url       = {https://www.aclweb.org/anthology/D17-1118},
  abstract  = {This article evaluates three proposed laws of semantic change. Our claim is
    that in order to validate a putative law of semantic change, the effect should
    be observed in the genuine condition but absent or reduced in a suitably
    matched control condition, in which no change can possibly have taken place.
    Our analysis shows that the effects reported in recent literature must be
    substantially revised: (i) the proposed negative correlation between meaning
    change and word frequency is shown to be largely an artefact of the models of
    word representation used; (ii) the proposed negative correlation between
    meaning change and prototypicality is shown to be much weaker than what has
    been claimed in prior art; and (iii) the proposed positive correlation between
    meaning change and polysemy is largely an artefact of word frequency. These
    empirical observations are corroborated by analytical proofs that show that
    count representations introduce an inherent dependence on word frequency, and
    thus word frequency cannot be evaluated as an independent factor with these
    representations.},
}

@InProceedings{lynn-EtAl:2017:EMNLP2017,
  author    = {Lynn, Veronica  and  Son, Youngseo  and  Kulkarni, Vivek  and  Balasubramanian, Niranjan  and  Schwartz, H. Andrew},
  title     = {Human Centered {NLP} with User-Factor Adaptation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1146--1155},
  abstract  = {We pose the general task of user-factor adaptation -- adapting supervised
	learning models to real-valued user factors inferred from a background of their
	language, reflecting the idea that a piece of text should be understood within
	the context of the user that wrote it. We introduce a continuous adaptation
	technique, suited for real-valued user factors that are common in social
	science and bringing us closer to personalized NLP, adapting to each user
	uniquely. We apply this technique with known user factors including age,
	gender, and personality traits, as well as latent factors, evaluating over five
	tasks: POS tagging, PP-attachment, sentiment analysis, sarcasm detection, and
	stance detection. Adaptation provides statistically significant benefits for 3
	of the 5 tasks: up to +1.2 points for PP-attachment, +3.4 points for sarcasm,
	and +3.0 points for stance.},
  url       = {https://www.aclweb.org/anthology/D17-1119}
}

@inproceedings{raganato-dellibovi-navigli:2017:EMNLP2017,
  author    = {Raganato, Alessandro and Delli Bovi, Claudio and Navigli, Roberto},
  title     = {Neural Sequence Learning Models for Word Sense Disambiguation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen, Denmark},
  month     = {September},
  year      = {2017},
  pages     = {1156--1167},
  url       = {https://www.aclweb.org/anthology/D17-1120},
  abstract  = {Word Sense Disambiguation models exist in many flavors. Even though supervised
    ones tend to perform best in terms of accuracy, they often lose ground to more
    flexible knowledge-based solutions, which do not require training by a word
    expert for every disambiguation target. To bridge this gap we adopt a different
    perspective and rely on sequence learning to frame the disambiguation problem:
    we propose and study in depth a series of end-to-end neural architectures
    directly tailored to the task, from bidirectional Long Short-Term Memory to
    encoder-decoder models. Our extensive evaluation over standard benchmarks and
    in multiple languages shows that sequence learning enables more versatile
    all-words models that consistently lead to state-of-the-art results, even
    against word experts with engineered features.},
}

@InProceedings{rosin-adar-radinsky:2017:EMNLP2017,
  author    = {Rosin, Guy D.  and  Adar, Eytan  and  Radinsky, Kira},
  title     = {Learning Word Relatedness over Time},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1168--1178},
  abstract  = {Search systems are often focused on providing relevant results for the ``now'',
	assuming both corpora and user needs that focus on the present. However, many
	corpora today reflect significant longitudinal collections ranging from 20
	years of the Web to hundreds of years of digitized newspapers and books.
	Understanding the temporal intent of the user and retrieving the most relevant
	historical content has become a significant challenge. Common search features,
	such as query expansion, leverage the relationship between terms but cannot
	function well across all times when relationships vary temporally. In this
	work, we introduce a temporal relationship model that is extracted from
	longitudinal data collections. The model supports the task of identifying,
	given two words, when they relate to each other. We present an algorithmic
	framework for this task and show its application for the task of query
	expansion, achieving high gain.},
  url       = {https://www.aclweb.org/anthology/D17-1121}
}

@inproceedings{shen-yang-deng:2017:EMNLP2017,
  author    = {Shen, Gehui and Yang, Yunlun and Deng, Zhi-Hong},
  title     = {Inter-Weighted Alignment Network for Sentence Pair Modeling},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen, Denmark},
  month     = {September},
  year      = {2017},
  pages     = {1179--1189},
  url       = {https://www.aclweb.org/anthology/D17-1122},
  abstract  = {Sentence pair modeling is a crucial problem in the field of natural language
    processing.
    In this paper, we propose a model to measure the similarity of a sentence pair
    focusing on the interaction information. We utilize the word level similarity
    matrix to discover fine-grained alignment of two sentences. It should be
    emphasized that each word in a sentence has a different importance from the
    perspective of semantic composition, so we exploit two novel and efficient
    strategies to explicitly calculate a weight for each word. Although the
    proposed model only use a sequential LSTM for sentence modeling without any
    external resource such as syntactic parser tree and additional lexicon
    features, experimental results show that our model achieves state-of-the-art
    performance on three datasets of two tasks.},
}

@inproceedings{wang-he-zhou:2017:EMNLP2017,
  author    = {Wang, Chengyu and He, Xiaofeng and Zhou, Aoying},
  title     = {A Short Survey on Taxonomy Learning from Text Corpora: Issues, Resources and Recent Advances},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen, Denmark},
  month     = {September},
  year      = {2017},
  pages     = {1190--1203},
  url       = {https://www.aclweb.org/anthology/D17-1123},
  abstract  = {A taxonomy is a semantic hierarchy, consisting of concepts linked by is-a
    relations. While a large number of taxonomies have been constructed from
    human-compiled resources (e.g., Wikipedia), learning taxonomies from text
    corpora has received a growing interest and is essential for long-tailed and
    domain-specific knowledge acquisition. In this paper, we overview recent
    advances on taxonomy construction from free texts, reorganizing relevant
    subtasks into a complete framework. We also overview resources for evaluation
    and discuss challenges for future research.},
}

@InProceedings{liu-EtAl:2017:EMNLP20173,
  author    = {Liu, Pengfei  and  Qian, Kaiyu  and  Qiu, Xipeng  and  Huang, Xuanjing},
  title     = {Idiom-Aware Compositional Distributed Semantics},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1204--1213},
  abstract  = {Idioms are peculiar linguistic constructions that impose great challenges for
	representing the semantics of language, especially in current prevailing
	end-to-end neural models, which assume that the semantics of a phrase or
	sentence can be literally composed from its constitutive words.
	In this paper, we propose an idiom-aware distributed semantic model to build
	representation of sentences on the basis of understanding their contained
	idioms. Our models are grounded in the literal-first psycholinguistic
	hypothesis, which can adaptively learn semantic compositionality of a phrase
	literally or idiomatically. To better evaluate our models, we also construct an
	idiom-enriched sentiment classification dataset with considerable scale and
	abundant peculiarities of idioms. The qualitative and quantitative experimental
	analyses demonstrate the efficacy of our models.},
  url       = {https://www.aclweb.org/anthology/D17-1124}
}

@InProceedings{zhang-pasupat-liang:2017:EMNLP2017,
  author    = {Zhang, Yuchen  and  Pasupat, Panupong  and  Liang, Percy},
  title     = {Macro Grammars and Holistic Triggering for Efficient Semantic Parsing},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1214--1223},
  abstract  = {To learn a semantic parser from denotations, a learning algorithm must search
	over a combinatorially large space of logical forms for ones consistent with
	the annotated denotations. We propose a new online learning algorithm that
	searches faster as training progresses. The two key ideas are using macro
	grammars to cache the abstract patterns of useful logical forms found thus far,
	and holistic triggering to efficiently retrieve the most relevant patterns
	based on sentence similarity. On the WikiTableQuestions dataset, we first
	expand the search space of an existing model to improve the state-of-the-art
	accuracy from 38.7\% to 42.7\%, and then use macro grammars and holistic
	triggering to achieve an 11x speedup and an accuracy of 43.7\%.},
  url       = {https://www.aclweb.org/anthology/D17-1125}
}

@InProceedings{lan-EtAl:2017:EMNLP20171,
  author    = {Lan, Wuwei  and  Qiu, Siyu  and  He, Hua  and  Xu, Wei},
  title     = {A Continuously Growing Dataset of Sentential Paraphrases},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1224--1234},
  abstract  = {A major challenge in paraphrase research is the lack of parallel corpora. In
	this paper, we present a new method to collect large-scale sentential
	paraphrases from Twitter by linking tweets through shared URLs. The main
	advantage of our method is its simplicity, as it gets rid of the classifier or
	human in the loop needed to select data before annotation and subsequent
	application of paraphrase identification algorithms in the previous work.
	We present the largest human-labeled paraphrase corpus to date of 51,524
	sentence pairs and the first cross-domain benchmarking for automatic paraphrase
	identification. In addition, we show that more than 30,000 new sentential
	paraphrases can be easily and continuously captured every month at $\sim$70\%
	precision, and demonstrate their utility for downstream NLP tasks through
	phrasal paraphrase extraction. We make our code and data freely available.},
  url       = {https://www.aclweb.org/anthology/D17-1126}
}

@inproceedings{su-yan:2017:EMNLP2017,
  author    = {Su, Yu and Yan, Xifeng},
  title     = {Cross-domain Semantic Parsing via Paraphrasing},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen, Denmark},
  month     = {September},
  year      = {2017},
  pages     = {1235--1246},
  url       = {https://www.aclweb.org/anthology/D17-1127},
  abstract  = {Existing studies on semantic parsing mainly focus on the in-domain setting. We
    formulate cross-domain semantic parsing as a domain adaptation problem: train a
    semantic parser on some source domains and then adapt it to the target domain.
    Due to the diversity of logical forms in different domains, this problem
    presents unique and intriguing challenges. By converting logical forms into
    canonical utterances in natural language, we reduce semantic parsing to
    paraphrasing, and develop an attentive sequence-to-sequence paraphrase model
    that is general and flexible to adapt to different domains. We discover two
    problems, small micro variance and large macro variance, of pre-trained word
    embeddings that hinder their direct use in neural networks, and propose
    standardization techniques as a remedy. On the popular Overnight dataset, which
    contains eight domains, we show that both cross-domain training and
    standardized pre-trained word embeddings can bring significant improvement.},
}

@inproceedings{yang-mitchell:2017:EMNLP2017,
  author    = {Yang, Bishan and Mitchell, Tom},
  title     = {A Joint Sequential and Relational Model for Frame-Semantic Parsing},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen, Denmark},
  month     = {September},
  year      = {2017},
  pages     = {1247--1256},
  url       = {https://www.aclweb.org/anthology/D17-1128},
  abstract  = {We introduce a new method for frame-semantic parsing that significantly
    improves the prior state of the art. Our model leverages the advantages of a
    deep bidirectional LSTM network which predicts semantic role labels word by
    word and a relational network which predicts semantic roles for individual text
    expressions in relation to a predicate. The two networks are integrated into a
    single model via knowledge distillation, and a unified graphical model is
    employed to jointly decode frames and semantic roles during inference.
    Experiments on the standard FrameNet data show that our model significantly
    outperforms existing neural and non-neural approaches, achieving a 5.7 F1 gain
    over the current state of the art, for full frame structure extraction.},
}

@InProceedings{wang-xue:2017:EMNLP2017,
  author    = {Wang, Chuan  and  Xue, Nianwen},
  title     = {Getting the Most out of {AMR} Parsing},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1257--1268},
  abstract  = {This paper proposes to tackle the AMR parsing bottleneck by improving two
	components of an AMR parser: concept identification and alignment. We first
	build a Bidirectional LSTM based concept identifier that is able to incorporate
	richer contextual information to learn sparse AMR concept labels. We then
	extend an HMM-based word-to-concept alignment model with graph distance
	distortion and a rescoring method during decoding to incorporate the structural
	information in the AMR graph. We show integrating the two components into an
	existing AMR parser results in consistently better performance over the state
	of the art on various datasets.},
  url       = {https://www.aclweb.org/anthology/D17-1129}
}

@InProceedings{ballesteros-alonaizan:2017:EMNLP2017,
  author    = {Ballesteros, Miguel  and  Al-Onaizan, Yaser},
  title     = {{AMR} Parsing using {Stack-LSTMs}},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1269--1275},
  abstract  = {We present a transition-based AMR parser that directly generates AMR parses
	from plain text.
	We use Stack-LSTMs to represent our parser state and make decisions greedily.
	In our experiments, we show that our parser achieves very competitive scores on
	English using only AMR training data. Adding additional information, such as
	POS tags and dependency trees, improves the results further.},
  url       = {https://www.aclweb.org/anthology/D17-1130}
}

@InProceedings{zhao-EtAl:2017:EMNLP20172,
  author    = {Zhao, Jie  and  Su, Yu  and  Guan, Ziyu  and  Sun, Huan},
  title     = {An End-to-End Deep Framework for Answer Triggering with a Novel Group-Level Objective},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1276--1282},
  abstract  = {Given a question and a set of answer candidates, answer triggering determines
	whether the candidate set contains any correct answers. If yes, it then outputs
	a correct one. In contrast to existing pipeline methods which first consider
	individual candidate answers separately and then make a prediction based on a
	threshold, we propose an end-to-end deep neural network framework, which is
	trained by a novel group-level objective function that directly optimizes the
	answer triggering performance. Our objective function penalizes three potential
	types of error and allows training the framework in an end-to-end manner.
	Experimental results on the WikiQA benchmark show that our framework
	outperforms the state of the arts by a 6.6\% absolute gain under F1 measure.},
  url       = {https://www.aclweb.org/anthology/D17-1131}
}

@inproceedings{cattle-ma:2017:EMNLP2017,
  author    = {Cattle, Andrew and Ma, Xiaojuan},
  title     = {Predicting Word Association Strengths},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen, Denmark},
  month     = {September},
  year      = {2017},
  pages     = {1283--1288},
  url       = {https://www.aclweb.org/anthology/D17-1132},
  abstract  = {This paper looks at the task of predicting word association strengths across
    three datasets; WordNet Evocation (Boyd-Graber et al., 2006), University of
    Southern Florida Free Association norms (Nelson et al., 2004), and Edinburgh
    Associative Thesaurus (Kiss et al., 1973). We achieve results of r=0.357 and
    p=0.379, r=0.344 and p=0.300, and r=0.292 and p=0.363, respectively. We find
    Word2Vec (Mikolov et al., 2013) and GloVe (Pennington et al., 2014) cosine
    similarities, as well as vector offsets, to be the highest performing features.
    Furthermore, we examine the usefulness of Gaussian embeddings (Vilnis and
    McCallum, 2014) for predicting word association strength, the first work to do
    so.},
}

@InProceedings{liu-lapata:2017:EMNLP2017,
  author    = {Liu, Yang  and  Lapata, Mirella},
  title     = {Learning Contextually Informed Representations for Linear-Time Discourse Parsing},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1289--1298},
  abstract  = {Recent advances in RST discourse parsing have focused on two modeling
	paradigms: (a) high order parsers which jointly predict the tree structure of
	the discourse and the relations it encodes; or (b) linear-time parsers which
	are efficient but mostly based on local features. In this work, we propose a
	linear-time parser with a novel way of representing discourse constituents
	based on neural networks which takes into account global contextual information
	and is able to capture long-distance dependencies. Experimental results show
	that our parser obtains state-of-the-art performance on benchmark datasets,
	while being efficient (with time complexity linear in the number of sentences
	in the document) and requiring minimal feature engineering.},
  url       = {https://www.aclweb.org/anthology/D17-1133}
}

@inproceedings{lan-EtAl:2017:EMNLP20172,
  author    = {Lan, Man and Wang, Jianxiang and Wu, Yuanbin and Niu, Zheng-Yu and Wang, Haifeng},
  title     = {Multi-task Attention-based Neural Networks for Implicit Discourse Relationship Representation and Identification},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen, Denmark},
  month     = {September},
  year      = {2017},
  pages     = {1299--1308},
  url       = {https://www.aclweb.org/anthology/D17-1134},
  abstract  = {We present a novel multi-task attention based neural network model to address
    implicit discourse relationship representation and identification through two
    types of representation learning, an attention based neural network for
    learning discourse relationship representation with two arguments and a
    multi-task framework for learning knowledge from annotated and unannotated
    corpora. The extensive experiments have been performed on two benchmark corpora
    (i.e., PDTB and CoNLL-2016 datasets). Experimental results show that our
    proposed model outperforms the state-of-the-art systems on benchmark corpora.},
}

@inproceedings{yin-EtAl:2017:EMNLP2017,
  author    = {Yin, Qingyu and Zhang, Yu and Zhang, Weinan and Liu, Ting},
  title     = {Chinese Zero Pronoun Resolution with Deep Memory Network},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen, Denmark},
  month     = {September},
  year      = {2017},
  pages     = {1309--1318},
  url       = {https://www.aclweb.org/anthology/D17-1135},
  abstract  = {Existing approaches for Chinese zero pronoun resolution typically utilize only
    syntactical and lexical features while ignoring semantic information. The
    fundamental reason is that zero pronouns have no descriptive information, which
    brings difficulty in explicitly capturing their semantic similarities with
    antecedents. Meanwhile, representing zero pronouns is challenging since they
    are merely gaps that convey no actual content. In this paper, we address this
    issue by building a deep memory network that is capable of encoding zero
    pronouns into vector representations with information obtained from their
    contexts and potential antecedents. Consequently, our resolver takes advantage
    of semantic information by using these continuous distributed representations.
    Experiments on the OntoNotes 5.0 dataset show that the proposed memory network
    could substantially outperform the state-of-the-art systems in various
    experimental settings.},
}

@InProceedings{morey-muller-asher:2017:EMNLP2017,
  author    = {Morey, Mathieu  and  Muller, Philippe  and  Asher, Nicholas},
  title     = {How much progress have we made on {RST} discourse parsing? A replication study of recent results on the {RST-DT}},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1319--1324},
  abstract  = {This article evaluates purported progress over the past years in RST discourse
	parsing.
	Several studies report a relative error reduction of 24 to 51\% on all metrics
	that authors attribute to the introduction of distributed representations of
	discourse units.
	We replicate the standard evaluation of 9 parsers, 5 of which use distributed
	representations, from 8 studies published between 2013 and 2017, using their
	predictions on the test set of the RST-DT.
	Our main finding is that most recently reported increases in RST discourse
	parser performance are an artefact of differences in implementations of the
	evaluation procedure.
	We evaluate all these parsers with the standard Parseval procedure to provide a
	more accurate picture of the actual RST discourse parsers performance in
	standard evaluation settings.
	Under this more stringent procedure, the gains attributable to distributed
	representations represent at most a 16\% relative error reduction on
	fully-labelled structures.},
  url       = {https://www.aclweb.org/anthology/D17-1136}
}

@InProceedings{loaiciga-guillou-hardmeier:2017:EMNLP2017,
  author    = {Lo{\'a}iciga, Sharid  and  Guillou, Liane  and  Hardmeier, Christian},
  title     = {What is it? Disambiguating the different readings of the pronoun `it'},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1325--1331},
  abstract  = {In this paper, we address the problem of predicting one of three functions for
	the English pronoun `it': anaphoric, event reference or pleonastic. This
	disambiguation is valuable in the context of machine translation and
	coreference resolution. We present experiments using a MAXENT classifier
	trained on gold-standard data and self-training experiments of an RNN trained
	on silver-standard data, annotated using the MAXENT classifier. Lastly, we
	report on an analysis of the strengths of these two models.},
  url       = {https://www.aclweb.org/anthology/D17-1137}
}

@inproceedings{heinzerling-moosavi-strube:2017:EMNLP2017,
  author    = {Heinzerling, Benjamin and Moosavi, Nafise Sadat and Strube, Michael},
  title     = {Revisiting Selectional Preferences for Coreference Resolution},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen, Denmark},
  month     = {September},
  year      = {2017},
  pages     = {1332--1339},
  url       = {https://www.aclweb.org/anthology/D17-1138},
  abstract  = {Selectional preferences have long been claimed to be essential for coreference
    resolution. However, they are modeled only implicitly by current coreference
    resolvers. We propose a dependency-based embedding model of selectional
    preferences which allows fine-grained compatibility judgments with high
    coverage. Incorporating our model improves performance, matching
    state-of-the-art results of a more complex system. However, it comes with a
    cost that makes it debatable how worthwhile are such improvements.},
}

@InProceedings{wang-EtAl:2017:EMNLP20172,
  author    = {Wang, Liang  and  Li, Sujian  and  Lv, Yajuan  and  Wang, Houfeng},
  title     = {Learning to Rank Semantic Coherence for Topic Segmentation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1340--1344},
  abstract  = {Topic segmentation plays an important role for discourse parsing and
	information retrieval. Due to the absence of training data, previous work
	mainly adopts unsupervised methods to rank semantic coherence between
	paragraphs for topic segmentation. In this paper, we present an intuitive and
	simple idea to automatically create a ``quasi'' training dataset, which includes
	a large amount of text pairs from the same or different documents with
	different semantic coherence. With the training corpus, we design a symmetric
	CNN neural network to model text pairs and rank the semantic coherence within
	the learning to rank framework. Experiments show that our algorithm is able to
	achieve competitive performance over strong baselines on several real-world
	datasets.},
  url       = {https://www.aclweb.org/anthology/D17-1139}
}

@InProceedings{shnarch-EtAl:2017:EMNLP2017,
  author    = {Shnarch, Eyal  and  Levy, Ran  and  Raykar, Vikas  and  Slonim, Noam},
  title     = {{GRASP}: Rich Patterns for Argumentation Mining},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1345--1350},
  abstract  = {GRASP (GReedy Augmented Sequential Patterns) is an algorithm for automatically
	extracting patterns that characterize subtle linguistic phenomena. To that end,
	GRASP augments each term of input text with multiple layers of linguistic
	information. These different facets of the text terms are systematically
	combined to reveal rich patterns. We report highly promising experimental
	results in several challenging text analysis tasks within the field of
	Argumentation Mining. We believe that GRASP is general enough to be useful for
	other domains too.
	For example, each of the following sentences includes a claim for a [topic]:
	1. Opponents often argue that the open primary is unconstitutional. [Open
	Primaries]
	2. Prof. Smith suggested that affirmative action devalues the accomplishments
	of the chosen. [Affirmative Action]
	3. The majority stated that the First Amendment does not guarantee the right to
	offend others. [Freedom of Speech]
	These sentences share almost no words in common, however, they are similar at a
	more abstract level. A human observer may notice the following underlying
	common structure, or pattern: [someone][argue/suggest/state][that][topic
	term][sentiment term].
	GRASP aims to automatically capture such underlying structures of the given
	data. For the above examples it finds the pattern
	[noun][express][that][noun,topic][sentiment], where [express] stands for all
	its (in)direct hyponyms, and [noun,topic] means a noun which is also related to
	the topic.},
  url       = {https://www.aclweb.org/anthology/D17-1140}
}

@InProceedings{alkhatib-EtAl:2017:EMNLP2017,
  author    = {Al Khatib, Khalid  and  Wachsmuth, Henning  and  Hagen, Matthias  and  Stein, Benno},
  title     = {Patterns of Argumentation Strategies across Topics},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1351--1357},
  abstract  = {This paper presents an analysis of argumentation strategies in news editorials within and across topics.
	Given nearly 29,000 argumentative editorials from the New York Times, we develop two machine learning models, one for determining an editorial's topic, and one for identifying evidence types in the editorial.
	Based on the distribution and structure of the identified types, we analyze the usage patterns of argumentation strategies among 12 different topics.
	We detect several common patterns that provide insights into the manifestation of argumentation strategies.
	Also, our experiments reveal clear correlations between the topics and the detected patterns.},
  url       = {https://www.aclweb.org/anthology/D17-1141}
}

@InProceedings{liu-EtAl:2017:EMNLP20174,
  author    = {Liu, Haijing  and  Gao, Yang  and  Lv, Pin  and  Li, Mengxue  and  Geng, Shiqiang  and  Li, Minglan  and  Wang, Hao},
  title     = {Using Argument-based Features to Predict and Analyse Review Helpfulness},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1358--1363},
  abstract  = {We study the helpful product reviews identification problem in this paper.
	We observe that the evidence-conclusion discourse relations, also known as arguments, often appear in product reviews, and we hypothesise that some argument-based features, e.g. the percentage of argumentative sentences, the evidences-conclusions ratios, are good indicators of helpful reviews.
	To validate this hypothesis, we manually annotate arguments in 110 hotel reviews, and investigate the effectiveness of several combinations of argument-based features.
	Experiments suggest that, when being used together with the argument-based features, the state-of-the-art baseline features can enjoy a performance boost (in terms of F1) of 11.01\% in average.},
  url       = {https://www.aclweb.org/anthology/D17-1142}
}

@InProceedings{potash-romanov-rumshisky:2017:EMNLP2017,
  author    = {Potash, Peter  and  Romanov, Alexey  and  Rumshisky, Anna},
  title     = {Here's My Point: Joint Pointer Architecture for Argument Mining},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1364--1373},
  abstract  = {In order to determine argument structure in text, one must understand how individual components of the overall argument are linked.
	This work presents the first neural network-based approach to link extraction in argument mining.
	Specifically, we propose a novel architecture that applies Pointer Network sequence-to-sequence attention modeling to structural prediction in discourse parsing tasks.
	We then develop a joint model that extends this architecture to simultaneously address the link extraction task and the classification of argument components.
	The proposed joint model achieves state-of-the-art results on two separate evaluation corpora, showing far superior performance than the previously proposed corpus-specific and heavily feature-engineered models.
	Furthermore, our results demonstrate that jointly optimizing for both tasks is crucial for high performance.},
  url       = {https://www.aclweb.org/anthology/D17-1143}
}

@InProceedings{cocarascu-toni:2017:EMNLP2017,
  author    = {Cocarascu, Oana  and  Toni, Francesca},
  title     = {Identifying attack and support argumentative relations using deep learning},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1374--1379},
  abstract  = {We propose a deep learning architecture to capture argumentative relations of attack and support from one piece of text to another, of the kind that naturally occur in a debate.
	The architecture uses two (unidirectional or bidirectional) Long Short-Term Memory networks and (trained or non-trained) word embeddings, and allows to considerably improve upon existing techniques that use syntactic features and supervised classifiers for the same form of (relation-based) argument mining.},
  url       = {https://www.aclweb.org/anthology/D17-1144}
}

@InProceedings{sperber-EtAl:2017:EMNLP2017,
  author    = {Sperber, Matthias  and  Neubig, Graham  and  Niehues, Jan  and  Waibel, Alex},
  title     = {Neural Lattice-to-Sequence Models for Uncertain Inputs},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1380--1389},
  abstract  = {The input to a neural sequence-to-sequence model is often determined by an up-stream system, e.g. a word segmenter, part of speech tagger, or speech recognizer.
	These up-stream models are potentially error-prone.
	Representing inputs through word lattices allows making this uncertainty explicit by capturing alternative sequences and their posterior probabilities in a compact form.
	In this work, we extend the TreeLSTM (Tai et al., 2015) into a LatticeLSTM that is able to consume word lattices, and can be used as encoder in an attentional encoder-decoder model.
	We integrate lattice posterior scores into this architecture by extending the TreeLSTM's child-sum and forget gates and introducing a bias term into the attention mechanism.
	We experiment with speech translation lattices and report consistent improvements over baselines that translate either the 1-best hypothesis or the lattice without posterior scores.},
  url       = {https://www.aclweb.org/anthology/D17-1145}
}

@InProceedings{feng-EtAl:2017:EMNLP2017,
  author    = {Feng, Yang  and  Zhang, Shiyue  and  Zhang, Andi  and  Wang, Dong  and  Abel, Andrew},
  title     = {Memory-augmented Neural Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1390--1399},
  abstract  = {Neural machine translation (NMT) has achieved notable success in recent times, however it is also widely recognized that this approach has limitations with handling infrequent words and word pairs.
	This paper presents a novel memory-augmented NMT (M-NMT) architecture, which stores knowledge about how words (usually infrequently encountered ones) should be translated in a memory and then utilizes them to assist the neural model.
	We use this memory mechanism to combine the knowledge learned from a conventional statistical machine translation system and the rules learned by an NMT system, and also propose a solution for out-of-vocabulary (OOV) words based on this framework.
	Our experiments on two Chinese-English translation tasks demonstrated that the M-NMT architecture outperformed the NMT baseline by $9.0$ and $2.7$ BLEU points on the two tasks, respectively.
	Additionally, we found this architecture resulted in a much more effective OOV treatment compared to competitive methods.},
  url       = {https://www.aclweb.org/anthology/D17-1146}
}

@InProceedings{vanderwees-bisazza-monz:2017:EMNLP2017,
  author    = {van der Wees, Marlies  and  Bisazza, Arianna  and  Monz, Christof},
  title     = {Dynamic Data Selection for Neural Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1400--1410},
  abstract  = {Intelligent selection of training data has proven a successful technique to
	simultaneously increase training efficiency and translation performance for
	phrase-based machine translation (PBMT). With the recent increase in popularity
	of neural machine translation (NMT), we explore in this paper to what extent
	and how NMT can also benefit from data selection. While state-of-the-art data
	selection (Axelrod et al., 2011) consistently performs well for PBMT, we show
	that gains are substantially lower for NMT. Next, we introduce `dynamic data
	selection' for NMT, a method in which we vary the selected subset of training
	data between different training epochs. Our experiments show that the best
	results are achieved when applying a technique we call `gradual fine-tuning',
	with improvements up to +2.6 BLEU over the original data selection approach and
	up to +3.1 BLEU over a general baseline.},
  url       = {https://www.aclweb.org/anthology/D17-1147}
}

@InProceedings{dahlmann-EtAl:2017:EMNLP2017,
  author    = {Dahlmann, Leonard  and  Matusov, Evgeny  and  Petrushkov, Pavel  and  Khadivi, Shahram},
  title     = {Neural Machine Translation Leveraging Phrase-based Models in a Hybrid Search},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1411--1420},
  abstract  = {In this paper, we introduce a hybrid search for attention-based neural machine translation (NMT).
	A target phrase learned with statistical MT models extends a hypothesis in the NMT beam search when the attention of the NMT model focuses on the source words translated by this phrase.
	Phrases added in this way are scored with the NMT model, but also with SMT features including phrase-level translation probabilities and a target language model.
	Experimental results on German-to-English news domain and English-to-Russian e-commerce domain translation tasks show that using phrase-based models in NMT search improves MT quality by up to 2.3\% BLEU absolute as compared to a strong NMT baseline.},
  url       = {https://www.aclweb.org/anthology/D17-1148}
}

@InProceedings{wang-EtAl:2017:EMNLP20173,
  author    = {Wang, Xing  and  Tu, Zhaopeng  and  Xiong, Deyi  and  Zhang, Min},
  title     = {Translating Phrases in Neural Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1421--1431},
  abstract  = {Phrases play an important role in natural language understanding and machine translation (Sag et al., 2002; Villavicencio et al., 2005).
	However, it is difficult to integrate them into current neural machine translation (NMT) which reads and generates sentences word by word.
	In this work, we propose a method to translate phrases in NMT by integrating a phrase memory storing target phrases from a phrase-based statistical machine translation (SMT) system into the encoder-decoder architecture of NMT.
	At each decoding step, the phrase memory is first re-written by the SMT model, which dynamically generates relevant target phrases with contextual information provided by the NMT model.
	Then the proposed model reads the phrase memory to make probability estimations for all phrases in the phrase memory.
	If phrase generation is carried on, the NMT decoder selects an appropriate phrase from the memory to perform phrase translation and updates its decoding state by consuming the words in the selected phrase.
	Otherwise, the NMT decoder generates a word from the vocabulary as the general NMT decoder does.
	Experiment results on the Chinese to English translation show that the proposed model achieves significant improvements over the baseline on various test sets.},
  url       = {https://www.aclweb.org/anthology/D17-1149}
}

@InProceedings{yang-EtAl:2017:EMNLP20172,
  author    = {Yang, Baosong  and  Wong, Derek F.  and  Xiao, Tong  and  Chao, Lidia S.  and  Zhu, Jingbo},
  title     = {Towards Bidirectional Hierarchical Representations for Attention-based Neural Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1432--1441},
  abstract  = {This paper proposes a hierarchical attentional neural translation model which focuses on enhancing source-side hierarchical representations by covering both local and global semantic information using a bidirectional tree-based encoder.
	To maximize the predictive likelihood of target words, a weighted variant of an attention mechanism is used to balance the attentive information between lexical and phrase vectors.
	Using a tree-based rare word encoding, the proposed model is extended to sub-word level to alleviate the out-of-vocabulary (OOV) problem.
	Empirical results reveal that the proposed model significantly outperforms sequence-to-sequence attention-based and tree-based neural translation models in English-Chinese translation tasks.},
  url       = {https://www.aclweb.org/anthology/D17-1150}
}

@InProceedings{britz-EtAl:2017:EMNLP2017,
  author    = {Britz, Denny  and  Goldie, Anna  and  Luong, Minh-Thang  and  Le, Quoc},
  title     = {Massive Exploration of Neural Machine Translation Architectures},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1442--1451},
  abstract  = {Neural Machine Translation (NMT) has shown remarkable progress over the past few years, with production systems now being deployed to end-users.
	As the field is moving rapidly, it has become unclear which elements of NMT architectures have a significant impact on translation quality.
	In this work, we present a large-scale analysis of the sensitivity of NMT architectures to common hyperparameters.
	We report empirical results and variance numbers for several hundred experimental runs, corresponding to over 250,000 GPU hours on a WMT English to German translation task.
	Our experiments provide practical insights into the relative importance of factors such as embedding size, network depth, RNN cell type, residual connections, attention mechanism, and decoding heuristics.
	As part of this contribution, we also release an open-source NMT framework in TensorFlow to make it easy for others to reproduce our results and perform their own experiments.},
  url       = {https://www.aclweb.org/anthology/D17-1151}
}

@InProceedings{wijaya-EtAl:2017:EMNLP2017,
  author    = {Wijaya, Derry Tanti  and  Callahan, Brendan  and  Hewitt, John  and  Gao, Jie  and  Ling, Xiao  and  Apidianaki, Marianna  and  Callison-Burch, Chris},
  title     = {Learning Translations via Matrix Completion},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1452--1463},
  abstract  = {Bilingual Lexicon Induction is the task of learning word translations
	without bilingual parallel corpora. We model this task as a matrix completion
	problem, and present an effective and extendable framework for completing the
	matrix. This method harnesses diverse bilingual and monolingual signals, each
	of which may be incomplete or noisy. Our model achieves state-of-the-art
	performance for both high and low resource languages.},
  url       = {https://www.aclweb.org/anthology/D17-1152}
}

@InProceedings{nguyen-daumeiii-boydgraber:2017:EMNLP2017,
  author    = {Nguyen, Khanh  and  Daum{\'e} III, Hal  and  Boyd-Graber, Jordan},
  title     = {Reinforcement Learning for Bandit Neural Machine Translation with Simulated Human Feedback},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1464--1474},
  abstract  = {Machine translation is a natural candidate problem for reinforcement learning
	from human feedback: users provide quick, dirty ratings on candidate
	translations to guide a system to improve. Yet, current neural machine
	translation training focuses on expensive human-generated reference
	translations. We describe a reinforcement learning algorithm that improves
	neural machine translation systems from simulated human feedback. Our algorithm
	combines the advantage actor-critic algorithm (Mnih et al., 2016) with the
	attention-based neural encoder-decoder architecture (Luong et al., 2015). This
	algorithm (a) is well-designed for problems with a large action space and
	delayed rewards, (b) effectively optimizes traditional corpus-level machine
	translation metrics, and (c) is robust to skewed, high-variance, granular
	feedback modeled after actual human behaviors.},
  url       = {https://www.aclweb.org/anthology/D17-1153}
}

@InProceedings{zhang-EtAl:2017:EMNLP20172,
  author    = {Zhang, Xiaowei  and  Chen, Wei  and  Wang, Feng  and  Xu, Shuang  and  Xu, Bo},
  title     = {Towards Compact and Fast Neural Machine Translation Using a Combined Method},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1475--1481},
  abstract  = {Neural Machine Translation (NMT) lays intensive burden on computation and memory cost.
	It is a challenge to deploy NMT models on the devices with limited computation and memory budgets.
	This paper presents a four stage pipeline to compress model and speed up the decoding for NMT.
	Our method first introduces a compact architecture based on convolutional encoder and weight shared embeddings.
	Then weight pruning is applied to obtain a sparse model.
	Next, we propose a fast sequence interpolation approach which enables the greedy decoding to achieve performance on par with the beam search.
	Hence, the time-consuming beam search can be replaced by simple greedy decoding.
	Finally, vocabulary selection is used to reduce the computation of softmax layer.
	Our final model achieves 10 times speedup, 17 times parameters reduction, less than 35MB storage size and comparable performance compared to the baseline model.},
  url       = {https://www.aclweb.org/anthology/D17-1154}
}

@InProceedings{wang-EtAl:2017:EMNLP20174,
  author    = {Wang, Rui  and  Utiyama, Masao  and  Liu, Lemao  and  Chen, Kehai  and  Sumita, Eiichiro},
  title     = {Instance Weighting for Neural Machine Translation Domain Adaptation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1482--1488},
  abstract  = {Instance weighting has been widely applied to phrase-based machine translation domain adaptation.
	However, it is challenging to be applied to Neural Machine Translation (NMT) directly, because NMT is not a linear model.
	In this paper, two instance weighting technologies, i.e., sentence weighting and domain weighting with a dynamic weight learning strategy, are proposed for NMT domain adaptation.
	Empirical results on the IWSLT English-German/French tasks show that the proposed methods can substantially improve NMT performance by up to 2.7-6.7 BLEU points, outperforming the existing baselines by up to 1.6-3.6 BLEU points.},
  url       = {https://www.aclweb.org/anthology/D17-1155}
}

@InProceedings{micelibarone-EtAl:2017:EMNLP2017,
  author    = {Miceli Barone, Antonio Valerio  and  Haddow, Barry  and  Germann, Ulrich  and  Sennrich, Rico},
  title     = {Regularization techniques for fine-tuning in neural machine translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1489--1494},
  abstract  = {We investigate techniques for supervised domain adaptation for neural machine
	translation where an existing model trained on a large out-of-domain dataset is
	adapted to a small in-domain dataset.
	In this scenario, overfitting is a major challenge. We investigate a number of
	techniques to reduce overfitting and improve transfer learning, including
	regularization techniques such as dropout and L2-regularization towards an
	out-of-domain prior. In addition, we introduce tuneout, a novel regularization
	technique inspired by dropout.
	We apply these techniques, alone and in combination, to neural machine
	translation, obtaining improvements on IWSLT datasets for
	English$\rightarrow$German and English$\rightarrow$Russian.
	We also investigate the amounts of in-domain training data needed for domain
	adaptation in NMT, and find a logarithmic relationship between the amount of
	training data and gain in BLEU score.},
  url       = {https://www.aclweb.org/anthology/D17-1156}
}

@InProceedings{chang-collins:2017:EMNLP2017,
  author    = {Chang, Yin-Wen  and  Collins, Michael},
  title     = {Source-Side Left-to-Right or Target-Side Left-to-Right? An Empirical Comparison of Two Phrase-Based Decoding Algorithms},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1495--1499},
  abstract  = {This paper describes an empirical study of the phrase-based decoding algorithm proposed by Chang and Collins (2017).
	The algorithm produces a translation by processing the source-language sentence in strictly left-to-right order, differing from commonly used approaches that build the target-language sentence in left-to-right order.
	Our results show that the new algorithm is competitive with Moses (Koehn et al., 2007) in terms of both speed and BLEU scores.},
  url       = {https://www.aclweb.org/anthology/D17-1157}
}

@InProceedings{domhan-hieber:2017:EMNLP2017,
  author    = {Domhan, Tobias  and  Hieber, Felix},
  title     = {Using Target-side Monolingual Data for Neural Machine Translation through Multi-task Learning},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1500--1505},
  abstract  = {The performance of Neural Machine Translation (NMT) models relies heavily on the availability of sufficient amounts of parallel data, and an efficient and effective way of leveraging the vastly available amounts of monolingual data has yet to be found.
	We propose to modify the decoder in a neural sequence-to-sequence model to enable multi-task learning for two strongly related tasks: target-side language modeling and translation.
	The decoder predicts the next target word through two channels, a target-side language model on the lowest layer, and an attentional recurrent model which is conditioned on the source representation.
	This architecture allows joint training on both large amounts of monolingual and moderate amounts of bilingual data to improve NMT performance.
	Initial results in the news domain for three language pairs show moderate but consistent improvements over a baseline trained on bilingual data only.},
  url       = {https://www.aclweb.org/anthology/D17-1158}
}

@InProceedings{marcheggiani-titov:2017:EMNLP2017,
  author    = {Marcheggiani, Diego  and  Titov, Ivan},
  title     = {Encoding Sentences with Graph Convolutional Networks for Semantic Role Labeling},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1506--1515},
  abstract  = {Semantic role labeling (SRL) is the task of identifying the predicate-argument structure of a sentence.
	It is typically regarded as an important step in the standard NLP pipeline.
	As the semantic representations are closely related to syntactic ones, we exploit syntactic information in our model.
	We propose a version of graph convolutional networks (GCNs), a recent class of neural networks operating on graphs, suited to model syntactic dependency graphs.
	GCNs over syntactic dependency trees are used as sentence encoders, producing latent feature representations of words in a sentence.
	We observe that GCN layers are complementary to LSTM ones: when we stack both GCN and LSTM layers, we obtain a substantial improvement over an already state-of-the-art LSTM SRL model, resulting in the best reported scores on the standard benchmark (CoNLL-2009) both for Chinese and English.},
  url       = {https://www.aclweb.org/anthology/D17-1159}
}

@InProceedings{krishnamurthy-dasigi-gardner:2017:EMNLP2017,
  author    = {Krishnamurthy, Jayant  and  Dasigi, Pradeep  and  Gardner, Matt},
  title     = {Neural Semantic Parsing with Type Constraints for Semi-Structured Tables},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1516--1526},
  abstract  = {We present a new semantic parsing model for answering compositional questions
	on semi-structured Wikipedia tables. Our parser is an encoder-decoder neural
	network with two key technical innovations: (1) a grammar for the decoder that
	only generates well-typed logical forms; and (2) an entity embedding and
	linking module that identifies entity mentions while generalizing across
	tables. We also introduce a novel method for training our neural model with
	question-answer supervision. On the WikiTableQuestions data set, our parser
	achieves a state-of-the-art accuracy of 43.3\% for a single model and 45.9\% for
	a 5-model ensemble, improving on the best prior score of 38.7\% set by a
	15-model ensemble. These results suggest that type constraints and entity
	linking are valuable components to incorporate in neural semantic parsers.},
  url       = {https://www.aclweb.org/anthology/D17-1160}
}

@InProceedings{srivastava-labutov-mitchell:2017:EMNLP2017,
  author    = {Srivastava, Shashank  and  Labutov, Igor  and  Mitchell, Tom},
  title     = {Joint Concept Learning and Semantic Parsing from Natural Language Explanations},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1527--1536},
  abstract  = {Natural language constitutes a predominant medium for much of human learning
	and pedagogy. We consider the problem of concept learning from natural language
	explanations, and a small number of labeled examples of the concept. For
	example, in learning the concept of a phishing email, one might say `this is a
	phishing email because it asks for your bank account number'. Solving this
	problem involves both learning to interpret open ended natural language
	statements, and learning the concept itself. We present a joint model for (1)
	language interpretation (semantic parsing) and (2) concept learning
	(classification) that does not require labeling statements with logical forms.
	Instead, the model prefers discriminative interpretations of statements in
	context of observable features of the data as a weak signal for parsing. On a
	dataset of email-related concepts, our approach yields across-the-board
	improvements in classification performance, with a 30\% relative improvement in
	F1 score over competitive methods in the low data regime.},
  url       = {https://www.aclweb.org/anthology/D17-1161}
}

@inproceedings{rei-EtAl:2017:EMNLP2017,
  title = {Grasping the Finer Point: A Supervised Similarity Network for Metaphor Detection},
  author = {Rei, Marek and Bulat, Luana and Kiela, Douwe and Shutova, Ekaterina},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  pages = {1537--1546},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url = {https://www.aclweb.org/anthology/D17-1162},
  abstract = {The ubiquity of metaphor in our everyday communication makes it an
    important problem for natural language understanding. Yet, the majority of
    metaphor processing systems to date rely on hand-engineered features and
    there is still no consensus in the field as to which features are optimal
    for this task. In this paper, we present the first deep learning
    architecture designed to capture metaphorical composition. Our results
    demonstrate that it outperforms the existing approaches in the metaphor
    identification task.},
}

@inproceedings{keith-EtAl:2017:EMNLP2017,
  title = {Identifying civilians killed by police with distantly supervised entity-event extraction},
  author = {Keith, Katherine and Handler, Abram and Pinkham, Michael and Magliozzi, Cara and McDuffie, Joshua and O'Connor, Brendan},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  pages = {1547--1557},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url = {https://www.aclweb.org/anthology/D17-1163},
  abstract = {We propose a new, socially-impactful task for natural language
    processing: from a news corpus, extract names of persons who have been
    killed by police. We present a newly collected police fatality corpus,
    which we release publicly, and present a model to solve this problem that
    uses EM-based distant supervision with logistic regression and
    convolutional neural network classifiers. Our model outperforms two
    off-the-shelf event extractor systems, and it can suggest candidate victim
    names in some cases faster than one of the major manually-collected police
    fatality databases.},
}

@InProceedings{zhang-spirling-danescuniculescumizil:2017:EMNLP2017,
  author    = {Zhang, Justine  and  Spirling, Arthur  and  Danescu-Niculescu-Mizil, Cristian},
  title     = {Asking too much? {The} rhetorical role of questions in political discourse},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1558--1572},
  abstract  = {Questions play a prominent role in social interactions, performing rhetorical
	functions that go beyond that of simple informational exchange.  The surface
	form of a question can signal the intention and background of the person asking
	it, as well as the nature of their relation with the interlocutor.  While the
	informational nature of questions has been extensively examined in the context
	of question-answering applications, their rhetorical aspects have been largely
	understudied.
	In this work we introduce an unsupervised methodology for extracting surface
	motifs that recur in questions, and for grouping them according to their latent
	rhetorical role.  By applying this framework to the setting of question
	sessions in the UK parliament, we show that the resulting typology encodes key
	aspects of the political discourse---such as the bifurcation in questioning
	behavior between government and opposition parties---and reveals new insights
	into the effects of a legislator's tenure and political career ambitions.},
  url       = {https://www.aclweb.org/anthology/D17-1164}
}

@inproceedings{vilares-he:2017:EMNLP2017,
  title = {Detecting Perspectives in Political Debates},
  author = {Vilares, David and He, Yulan},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  pages = {1573--1582},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url = {https://www.aclweb.org/anthology/D17-1165},
  abstract = {We explore how to detect people's perspectives that occupy a certain
    proposition. We propose a Bayesian modelling approach where topics (or
    propositions) and their associated perspectives (or viewpoints) are
    modeled as latent variables. Words associated with topics or perspectives
    follow different generative routes. Based on the extracted perspectives,
    we can extract the top associated sentences from text to generate a
    succinct summary which allows a quick glimpse of the main viewpoints in a
    document. The model is evaluated on debates from the House of Commons of
    the UK Parliament, revealing perspectives from the debates without the use
    of labelled data and obtaining better results than previous related
    solutions under a variety of evaluations.},
}

@InProceedings{swamy-ritter-demarneffe:2017:EMNLP2017,
  author    = {Swamy, Sandesh  and  Ritter, Alan  and  de Marneffe, Marie-Catherine},
  title     = {``i have a feeling trump will win..................'': Forecasting Winners and Losers from User Predictions on {Twitter}},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1583--1592},
  abstract  = {Social media users often make explicit predictions about upcoming events. Such
	statements vary in the degree of certainty the author expresses toward the
	outcome: ``Leonardo DiCaprio will win Best Actor'' vs. ``Leonardo DiCaprio may
	win'' or ``No way Leonardo wins!''. Can popular beliefs on social media predict
	who will win? To answer this question, we build a corpus of tweets annotated
	for veridicality on which we train a log-linear classifier that detects
	positive veridicality with high precision. We then forecast uncertain outcomes
	using the wisdom of crowds, by aggregating users' explicit predictions. Our
	method for forecasting winners is fully automated, relying only on a set of
	contenders as input. It requires no training data of past outcomes and
	outperforms sentiment and tweet volume baselines on a broad range of contest
	prediction tasks. We further demonstrate how our approach can be used to
	measure the reliability of individual accounts' predictions and retrospectively
	identify surprise outcomes.},
  url       = {https://www.aclweb.org/anthology/D17-1166}
}

@InProceedings{gui-EtAl:2017:EMNLP20171,
  author    = {Gui, Lin  and  Hu, Jiannan  and  He, Yulan  and  Xu, Ruifeng  and  Qin, Lu  and  Du, Jiachen},
  title     = {A Question Answering Approach for Emotion Cause Extraction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1593--1602},
  abstract  = {Emotion cause extraction aims to identify the reasons behind a certain emotion
	expressed in text. It is a much more difficult task compared to emotion
	classification. Inspired by recent advances in using deep memory networks for
	question answering (QA), we propose a new approach which considers emotion
	cause identification as a reading comprehension task in QA. Inspired by
	convolutional neural networks, we propose a new mechanism to store relevant
	context in different memory slots to model context information. Our proposed
	approach can extract both word level sequence features and lexical features.
	Performance evaluation shows that our method achieves the state-of-the-art
	performance on a recently released emotion cause dataset, outperforming a
	number of competitive baselines by at least 3.01\% in F-measure.},
  url       = {https://www.aclweb.org/anthology/D17-1167}
}

@inproceedings{chaturvedi-peng-roth:2017:EMNLP2017,
  title = {Story Comprehension for Predicting What Happens Next},
  author = {Chaturvedi, Snigdha and Peng, Haoruo and Roth, Dan},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  pages = {1603--1614},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url = {https://www.aclweb.org/anthology/D17-1168},
  abstract = {Automatic story comprehension is a fundamental challenge in Natural
    Language Understanding, and can enable computers to learn about social
    norms, human behavior and commonsense. In this paper, we present a story
    comprehension model that explores three distinct semantic aspects: (i) the
    sequence of events described in the story, (ii) its emotional trajectory,
    and (iii) its plot consistency. We judge the model's understanding of
    real-world stories by inquiring if, like humans, it can develop an
    expectation of what will happen next in a given story. Specifically, we
    use it to predict the correct ending of a given short story from possible
    alternatives. The model uses a hidden variable to weigh the semantic
    aspects in the context of the story. Our experiments demonstrate the
    potential of our approach to characterize these semantic aspects, and the
    strength of the hidden variable based approach. The model outperforms the
    state-of-the-art approaches and achieves best results on a publicly
    available dataset.},
}

@inproceedings{felbo-EtAl:2017:EMNLP2017,
  title = {Using millions of emoji occurrences to learn any-domain representations for detecting sentiment, emotion and sarcasm},
  author = {Felbo, Bjarke and Mislove, Alan and S{\o}gaard, Anders and Rahwan, Iyad and Lehmann, Sune},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  pages = {1615--1625},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url = {https://www.aclweb.org/anthology/D17-1169},
  abstract = {NLP tasks are often limited by scarcity of manually annotated data. In
    social media sentiment analysis and related tasks, researchers have
    therefore used binarized emoticons and specific hashtags as forms of
    distant supervision. Our paper shows that by extending the distant
    supervision to a more diverse set of noisy labels, the models can learn
    richer representations. Through emoji prediction on a dataset of 1246
    million tweets containing one of 64 common emojis we obtain
    state-of-the-art performance on 8 benchmark datasets within emotion,
    sentiment and sarcasm detection using a single pretrained model. Our
    analyses confirm that the diversity of our emotional labels yield a
    performance improvement over previous distant supervision approaches.},
}

@inproceedings{wang-zhang:2017:EMNLP2017,
  title = {Opinion Recommendation Using A Neural Model},
  author = {Wang, Zhongqing and Zhang, Yue},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  pages = {1626--1637},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url = {https://www.aclweb.org/anthology/D17-1170},
  abstract = {We present opinion recommendation, a novel task of jointly generating a
    review with a rating score that a certain user would give to a certain
    product which is unreviewed by the user, given existing reviews to the
    product by other users, and the reviews that the user has given to other
    products. A characteristic of opinion recommendation is the reliance of
    multiple data sources for multi-task joint learning. We use a single
    neural network to model users and products, generating customised product
    representations using a deep memory network, from which customised ratings
    and reviews are constructed jointly. Results show that our opinion
    recommendation system gives ratings that are closer to real user ratings
    on Yelp.com data compared with Yelp's own ratings. our methods give better
    results compared to several pipelines baselines.},
}

@InProceedings{cai-jiang-tu:2017:EMNLP2017,
  author    = {Cai, Jiong  and  Jiang, Yong  and  Tu, Kewei},
  title     = {{CRF} Autoencoder for Unsupervised Dependency Parsing},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1638--1643},
  abstract  = {Unsupervised dependency parsing, which tries to discover linguistic dependency
	structures from unannotated data, is a very challenging task. Almost all
	previous work on this task focuses on learning generative models. In this
	paper, we develop an unsupervised dependency parsing model based on the CRF
	autoencoder. The encoder part of our model is discriminative and globally
	normalized which allows us to use rich features as well as universal linguistic
	priors. We propose an exact algorithm for parsing as well as a tractable
	learning algorithm. We evaluated the performance of our model on eight
	multilingual treebanks and found that our model achieved comparable performance
	with state-of-the-art approaches.},
  url       = {https://www.aclweb.org/anthology/D17-1171}
}

@inproceedings{corro-leroux-lacroix:2017:EMNLP2017,
  title = {Efficient Discontinuous Phrase-Structure Parsing via the Generalized Maximum Spanning Arborescence},
  author = {Corro, Caio and Le Roux, Joseph and Lacroix, Mathieu},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  pages = {1644--1654},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url = {https://www.aclweb.org/anthology/D17-1172},
  abstract = {We present a new method for the joint task of tagging and
    non-projective dependency parsing. We demonstrate its usefulness with an
    application to discontinuous phrase-structure parsing where decoding
    lexicalized spines and syntactic derivations is performed jointly. The
    main contributions of this paper are (1) a reduction from joint tagging
    and non-projective dependency parsing to the Generalized Maximum Spanning
    Arborescence problem, and (2) a novel decoding algorithm for this problem
    through Lagrangian relaxation. We evaluate this model and obtain
    state-of-the-art results despite strong independence assumptions.},
}

@inproceedings{zheng:2017:EMNLP2017,
  title = {Incremental Graph-based Neural Dependency Parsing},
  author = {Zheng, Xiaoqing},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  pages = {1655--1665},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url = {https://www.aclweb.org/anthology/D17-1173},
  abstract = {Very recently, some studies on neural dependency parsers have shown
    advantage over the traditional ones on a wide variety of languages.
    However, for graph-based neural dependency parsing systems, they either
    count on the long-term memory and attention mechanism to implicitly
    capture the high-order features or give up the global exhaustive inference
    algorithms in order to harness the features over a rich history of parsing
    decisions. The former might miss out the important features for specific
    headword predictions without the help of the explicit structural
    information, and the latter may suffer from the error propagation as false
    early structural constraints are used to create features when making
    future predictions. We explore the feasibility of explicitly taking
    high-order features into account while remaining the main advantage of
    global inference and learning for graph-based parsing. The proposed parser
    first forms an initial parse tree by head-modifier predictions based on
    the first-order factorization. High-order features (such as grandparent,
    sibling, and uncle) then can be defined over the initial tree, and used to
    refine the parse tree in an iterative fashion. Experimental results showed
    that our model (called INDP) archived competitive performance to existing
    benchmark parsers on both English and Chinese datasets.},
}

@InProceedings{stanojevic-garridoalhama:2017:EMNLP2017,
  author    = {Stanojevi{\'c}, Milo{\v{s}}  and  Garrido Alhama, Raquel},
  title     = {Neural Discontinuous Constituency Parsing},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1666--1676},
  abstract  = {One of the most pressing issues in discontinuous constituency transition-based
	parsing is that the relevant information for parsing decisions could be located
	in any part of the stack or the buffer. 
	In this paper, we propose a solution to this problem by replacing the
	structured perceptron model with a recursive neural model that computes a
	global representation of the configuration, therefore allowing even the most
	remote parts of the configuration to influence the parsing decisions. We also
	provide a detailed analysis of how this representation should be built out of
	sub-representations of its core elements (words, trees and stack).
	 Additionally, we investigate how different types of swap oracles influence the
	results.  Our model is the first neural discontinuous constituency parser, and
	it outperforms all the previously published models on three out of four
	datasets while on the fourth it obtains second place by a tiny difference.},
  url       = {https://www.aclweb.org/anthology/D17-1174}
}

@inproceedings{zhang-EtAl:2017:EMNLP20173,
  title = {Stack-based Multi-layer Attention for Transition-based Dependency Parsing},
  author = {Zhang, Zhirui and Liu, Shujie and Li, Mu and Zhou, Ming and Chen, Enhong},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  pages = {1677--1682},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url = {https://www.aclweb.org/anthology/D17-1175},
  abstract = {Although sequence-to-sequence (seq2seq) network has achieved
    significant success in many NLP tasks such as machine translation and text
    summarization, simply applying this approach to transition-based
    dependency parsing cannot yield a comparable performance gain as in other
    state-of-the-art methods, such as stack-LSTM and head selection. In this
    paper, we propose a stack-based multi-layer attention model for seq2seq
    learning to better leverage structural linguistics information. In our
    method, two binary vectors are used to track the decoding stack in
    transition-based parsing, and multi-layer attention is introduced to
    capture multiple word dependencies in partial trees. We conduct
    experiments on PTB and CTB datasets, and the results show that our
    proposed model achieves state-of-the-art accuracy and significant
    improvement in labeled precision with respect to the baseline seq2seq
    model.},
}

@InProceedings{han-jiang-tu:2017:EMNLP2017,
  author    = {Han, Wenjuan  and  Jiang, Yong  and  Tu, Kewei},
  title     = {Dependency Grammar Induction with Neural Lexicalization and Big Training Data},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1683--1688},
  abstract  = {We study the impact of big models (in terms of the degree of lexicalization)
	and big data (in terms of the training corpus size) on dependency grammar
	induction.
	We experimented with L-DMV, a lexicalized version of Dependency Model with
	Valence (Klein and Manning, 2004) and L-NDMV, our lexicalized
	extension of the Neural Dependency Model with Valence
	(Jiang et al., 2016). 
	We find that L-DMV only benefits from very small degrees of lexicalization and
	moderate sizes of training corpora. L-NDMV can benefit from big training data
	and lexicalization of greater degrees, especially when enhanced with good model
	initialization, and it achieves a result that is competitive with the current
	state-of-the-art.},
  url       = {https://www.aclweb.org/anthology/D17-1176}
}

@inproceedings{jiang-han-tu:2017:EMNLP2017,
  title = {Combining Generative and Discriminative Approaches to Unsupervised Dependency Parsing via Dual Decomposition},
  author = {Jiang, Yong and Han, Wenjuan and Tu, Kewei},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  pages = {1689--1694},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url = {https://www.aclweb.org/anthology/D17-1177},
  abstract = {Unsupervised dependency parsing aims to learn a dependency parser from
    unannotated sentences. Existing work focuses on either learning generative
    models using the expectation-maximization algorithm and its variants, or
    learning discriminative models using the discriminative clustering
    algorithm. In this paper, we propose a new learning strategy that learns a
    generative model and a discriminative model jointly based on the dual
    decomposition method. Our method is simple and general, yet effective to
    capture the advantages of both models and improve their learning results.
    We tested our method on the UD treebank and achieved a state-of-the-art
    performance on thirty languages.},
}

@inproceedings{stern-fried-klein:2017:EMNLP2017,
  title = {Effective Inference for Generative Neural Parsing},
  author = {Stern, Mitchell and Fried, Daniel and Klein, Dan},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  pages = {1695--1700},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url = {https://www.aclweb.org/anthology/D17-1178},
  abstract = {Generative neural models have recently achieved state-of-the-art
    results for constituency parsing. However, without a feasible search
    procedure, their use has so far been limited to reranking the output of
    external parsers in which decoding is more tractable. We describe an
    alternative to the conventional action-level beam search used for
    discriminative neural models that enables us to decode directly in these
    generative models. We then show that by improving our basic candidate
    selection strategy and using a coarse pruning function, we can improve
    accuracy while exploring significantly less of the search space. Applied
    to the model of Choe and Charniak (2016), our inference procedure obtains
    92.56 F1 on section 23 of the Penn Treebank, surpassing prior
    state-of-the-art results for single-model systems.},
}

@InProceedings{zhang-EtAl:2017:EMNLP20174,
  author    = {Zhang, Xiao  and  Jiang, Yong  and  Peng, Hao  and  Tu, Kewei  and  Goldwasser, Dan},
  title     = {Semi-supervised Structured Prediction with Neural {CRF} Autoencoder},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1701--1711},
  abstract  = {In this paper we propose an end-to-end neural CRF autoencoder (NCRF-AE) model
	for semi-supervised learning of sequential structured prediction problems. Our
	NCRF-AE consists of two parts: an encoder which is a CRF model enhanced by deep
	neural networks, and a decoder which is a generative model trying to
	reconstruct the input. Our model has a unified structure with different loss
	functions for labeled and unlabeled data with shared parameters. We developed a
	variation of the EM algorithm for optimizing both the encoder and the decoder
	simultaneously by decoupling their parameters. Our Experimental results over
	the Part-of-Speech (POS) tagging task on eight different languages, show that
	our model can outperform competitive systems in both supervised and
	semi-supervised scenarios.},
  url       = {https://www.aclweb.org/anthology/D17-1179}
}

@InProceedings{kasai-EtAl:2017:EMNLP2017,
  author    = {Kasai, Jungo  and  Frank, Bob  and  McCoy, Tom  and  Rambow, Owen  and  Nasr, Alexis},
  title     = {{TAG} Parsing with Neural Networks and Vector Representations of Supertags},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1712--1722},
  abstract  = {We present supertagging-based models for Tree Adjoining Grammar parsing that
	use neural network architectures and dense vector representation of supertags
	(elementary trees) to achieve state-of-the-art performance in unlabeled and
	labeled attachment scores. The shift-reduce parsing model eschews lexical
	information entirely, and uses only the 1-best supertags to parse a sentence,
	providing further support for the claim that supertagging is ``almost parsing.''
	We demonstrate that the embedding vector representations the parser induces for
	supertags possess linguistically interpretable structure, supporting analogies
	between grammatical structures like those familiar from recent work in
	distributional semantics. This dense representation of supertags overcomes the
	drawbacks for statistical models of TAG as compared to CCG parsing, raising the
	possibility that TAG is a viable alternative for NLP tasks that require the
	assignment of richer structural descriptions to sentences.},
  url       = {https://www.aclweb.org/anthology/D17-1180}
}

@InProceedings{adel-schutze:2017:EMNLP2017,
  author    = {Adel, Heike  and  Sch{\"u}tze, Hinrich},
  title     = {Global Normalization of Convolutional Neural Networks for Joint Entity and Relation Classification},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1723--1729},
  abstract  = {We introduce globally normalized convolutional neural networks for joint entity
	classification and relation extraction. In particular, we propose a way to
	utilize a linear-chain conditional random field output layer for predicting
	entity types and relations between entities at the same time. Our experiments
	show that global normalization outperforms a locally normalized softmax layer
	on a benchmark dataset.},
  url       = {https://www.aclweb.org/anthology/D17-1181}
}

@inproceedings{zhang-zhang-fu:2017:EMNLP2017,
  title = {End-to-End Neural Relation Extraction with Global Optimization},
  author = {Zhang, Meishan and Zhang, Yue and Fu, Guohong},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  pages = {1730--1740},
  year = {2017},
  month = {September},
  address = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url = {https://www.aclweb.org/anthology/D17-1182},
  abstract = {Neural networks have shown promising results for relation extraction.
    State-of-the-art models cast the task as an end-to-end problem, solved
    incrementally using a local classifier. Yet previous work using
    statistical models have demonstrated that global optimization can achieve
    better performances compared to local classification. We build a globally
    optimized neural model for end-to-end relation extraction, proposing novel
    LSTM features in order to better learn context representations. In
    addition, we present a novel method to integrate syntactic information to
    facilitate global learning, yet requiring little background on syntactic
    grammars thus being easy to extend. Experimental results show that our
    proposed model is highly effective, achieving the best performances on two
    standard benchmarks.},
}

@InProceedings{ojha-talukdar:2017:EMNLP2017,
  author    = {Ojha, Prakhar  and  Talukdar, Partha},
  title     = {{KGEval}: Accuracy Estimation of Automatically Constructed Knowledge Graphs},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1741--1750},
  abstract  = {Automatic construction of large knowledge graphs (KG) by mining web-scale text
	datasets has received considerable attention recently. Estimating accuracy of
	such automatically constructed KGs is a challenging problem due to their size
	and diversity. This important problem has largely been ignored in prior
	research -- we fill this gap and propose KGEval. KGEval uses coupling
	constraints to bind facts and crowdsources those few that can infer large parts
	of the graph. We demonstrate that the objective optimized by KGEval is
	submodular and NP-hard, allowing guarantees for our approximation algorithm.
	Through experiments on real-world datasets, we demonstrate that KGEval best
	estimates KG accuracy compared to other baselines, while requiring
	significantly lesser number of human evaluations.},
  url       = {https://www.aclweb.org/anthology/D17-1183}
}

@inproceedings{pujara-augustine-getoor:2017:EMNLP2017,
  author = {Pujara, Jay and Augustine, Eriq and Getoor, Lise},
  title = {Sparsity and Noise: Where Knowledge Graph Embeddings Fall Short},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  pages = {1751--1756},
  publisher = {Association for Computational Linguistics},
  address = {Copenhagen, Denmark},
  url = {https://www.aclweb.org/anthology/D17-1184},
  abstract = {Knowledge graph (KG) embedding techniques use structured relationships between
    entities to learn low-dimensional representations of entities and relations.
    One prominent goal of these approaches is to improve the quality of knowledge
    graphs by removing errors and adding missing facts. Surprisingly, most
    embedding techniques have been evaluated on benchmark datasets consisting of
    dense and reliable subsets of human-curated KGs, which tend to be fairly
    complete and have few errors. In this paper, we consider the problem of
    applying embedding techniques to KGs extracted from text, which are often
    incomplete and contain errors. We compare the sparsity and unreliability of
    different KGs and perform empirical experiments demonstrating how embedding
    approaches degrade as sparsity and unreliability increase.}
}

@InProceedings{glavavs-ponzetto:2017:EMNLP2017,
  author    = {Glava{\v{s}}, Goran  and  Ponzetto, Simone Paolo},
  title     = {Dual Tensor Model for Detecting Asymmetric Lexico-Semantic Relations},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1757--1767},
  abstract  = {Detection of lexico-semantic relations is one of the central tasks of
	computational semantics. Although some fundamental relations (e.g., hypernymy)
	are asymmetric, most existing models account for asymmetry only implicitly and
	use the same concept representations to support detection of symmetric and
	asymmetric relations alike. In this work, we propose the Dual Tensor model, a
	neural architecture with which we explicitly model the asymmetry and capture
	the translation between unspecialized and specialized word embeddings via a
	pair of tensors. Although our Dual Tensor model needs only unspecialized
	embeddings as input, our experiments on hypernymy and meronymy detection
	suggest that it can outperform more complex and resource-intensive models. We
	further demonstrate that the model can account for polysemy and that it
	exhibits stable performance across languages.},
  url       = {https://www.aclweb.org/anthology/D17-1185}
}

@InProceedings{zeng-EtAl:2017:EMNLP2017,
  author    = {Zeng, Wenyuan  and  Lin, Yankai  and  Liu, Zhiyuan  and  Sun, Maosong},
  title     = {Incorporating Relation Paths in Neural Relation Extraction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1768--1777},
  abstract  = {Distantly supervised relation extraction has been widely used to find novel
	relational facts from plain text. To predict the relation between a pair of two
	target entities, existing methods solely rely on those direct sentences
	containing both entities. In fact, there are also many sentences containing
	only one of the target entities, which also provide rich useful information but
	not yet employed by relation extraction. To address this issue, we build
	inference chains between two target entities via intermediate entities, and
	propose a path-based neural relation extraction model to encode the relational
	semantics from both direct sentences and inference chains. Experimental results
	on real-world datasets show that, our model can make full use of those
	sentences containing only one target entity, and achieves significant and
	consistent improvements on relation extraction as compared with strong
	baselines. The source code of this paper can be obtained from
	https://github.com/thunlp/PathNRE.},
  url       = {https://www.aclweb.org/anthology/D17-1186}
}

@inproceedings{wu-bamman-russell:2017:EMNLP2017,
  author = {Wu, Yi and Bamman, David and Russell, Stuart},
  title = {Adversarial Training for Relation Extraction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  pages = {1778--1783},
  publisher = {Association for Computational Linguistics},
  address = {Copenhagen, Denmark},
  url = {https://www.aclweb.org/anthology/D17-1187},
  abstract = {Adversarial training is a mean of regularizing classification algorithms by
    generating adversarial noise to the training data. We apply adversarial
    training in relation extraction within the multi-instance multi-label learning
    framework. We evaluate various neural network architectures on two different
    datasets. Experimental results demonstrate that adversarial training is
    generally effective for both CNN and RNN models and significantly improves the
    precision of predicted relations.}
}

@InProceedings{sorokin-gurevych:2017:EMNLP2017,
  author    = {Sorokin, Daniil  and  Gurevych, Iryna},
  title     = {Context-Aware Representations for Knowledge Base Relation Extraction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1784--1789},
  abstract  = {We demonstrate that for sentence-level relation extraction it is beneficial to
	consider other relations in the sentential context while predicting the target
	relation. Our architecture uses an LSTM-based encoder to jointly learn
	representations for all relations in a single sentence. We combine the context
	representations with an attention mechanism to make the final prediction.
	We use the Wikidata knowledge base to construct a dataset of multiple relations
	per sentence and to evaluate our approach. Compared to a baseline system, our
	method results in an average error reduction of 24\% on a held-out set of
	relations.
	The code and the dataset to replicate the experiments are made available at
	https://github.com/ukplab/.},
  url       = {https://www.aclweb.org/anthology/D17-1188}
}

@InProceedings{liu-EtAl:2017:EMNLP20175,
  author    = {Liu, Tianyu  and  Wang, Kexiang  and  Chang, Baobao  and  Sui, Zhifang},
  title     = {A Soft-label Method for Noise-tolerant Distantly Supervised Relation Extraction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1790--1795},
  abstract  = {Distant-supervised relation extraction inevitably suffers from wrong labeling
	problems because it heuristically labels relational facts with knowledge bases.
	Previous sentence level denoise models don't achieve satisfying performances
	because they use hard labels which are determined by distant supervision and
	immutable during training. To this end, we introduce an entity-pair level
	denoise method which exploits semantic information from correctly labeled
	entity pairs to correct wrong labels dynamically during training. We propose a
	joint score function which combines the relational scores based on the entity-pair
	representation and the confidence of the hard label to obtain a new label,
	namely a soft label, for certain entity pair. During training, soft labels
	instead of hard labels serve as gold labels. Experiments on the benchmark
	dataset show that our method dramatically reduces noisy instances and
	outperforms other state-of-the-art systems.},
  url       = {https://www.aclweb.org/anthology/D17-1189}
}

@inproceedings{choubey-huang:2017:EMNLP20171,
  author = {Choubey, Prafulla Kumar and Huang, Ruihong},
  title = {A Sequential Model for Classifying Temporal Relations between Intra-Sentence Events},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  pages = {1796--1802},
  publisher = {Association for Computational Linguistics},
  address = {Copenhagen, Denmark},
  url = {https://www.aclweb.org/anthology/D17-1190},
  abstract = {We present a sequential model for temporal relation classification between
    intra-sentence events. The key observation is that the overall syntactic
    structure and compositional meanings of the multi-word context between events
    are important for distinguishing among fine-grained temporal relations.
    Specifically, our approach first extracts a sequence of context words that
    indicates the temporal relation between two events, which well align with the
    dependency path between two event mentions. The context word sequence, together
    with a parts-of-speech tag sequence and a dependency relation sequence that are
    generated corresponding to the word sequence, are then provided as input to
    bidirectional recurrent neural network (LSTM) models. The neural nets learn
    compositional syntactic and semantic representations of contexts surrounding
    the two events and predict the temporal relation between them. Evaluation of
    the proposed approach on TimeBank corpus shows that sequential modeling is
    capable of accurately recognizing temporal relations between events, which
    outperforms a neural net model using various discrete features as input that
    imitates previous feature based models.}
}

@inproceedings{huang-wang:2017:EMNLP2017,
  author = {Huang, YiYao and Wang, William Yang},
  title = {Deep Residual Learning for Weakly-Supervised Relation Extraction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  pages = {1803--1807},
  publisher = {Association for Computational Linguistics},
  address = {Copenhagen, Denmark},
  url = {https://www.aclweb.org/anthology/D17-1191},
  abstract = {Deep residual learning (ResNet) is a new method for training very deep neural
    networks using identity mapping for shortcut connections. ResNet has won the
    ImageNet ILSVRC 2015 classification task, and achieved state-of-the-art
    performances in many computer vision tasks. However, the effect of residual
    learning on noisy natural language processing tasks is still not well
    understood. In this paper, we design a novel convolutional neural network (CNN)
    with residual learning, and investigate its impacts on the task of distantly
    supervised noisy relation extraction. In contradictory to popular beliefs that
    ResNet only works well for very deep networks, we found that even with 9
    layers of CNNs, using identity mapping could significantly improve the
    performance for distantly-supervised relation extraction.}
}

@InProceedings{zhang-wang:2017:EMNLP2017,
  author    = {Zhang, Qing  and  Wang, Houfeng},
  title     = {Noise-Clustered Distant Supervision for Relation Extraction: A Nonparametric {Bayesian} Perspective},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1808--1813},
  abstract  = {For the task of relation extraction, distant supervision is an efficient
	approach to generate labeled data by aligning knowledge base with free texts.
	The essence of it is a challenging incomplete multi-label classification
	problem with sparse and noisy features. To address the challenge, this work
	presents a novel nonparametric Bayesian formulation for the task. Experiment
	results show substantially higher top precision improvements over the
	traditional state-of-the-art approaches.},
  url       = {https://www.aclweb.org/anthology/D17-1192}
}

@InProceedings{gabor-EtAl:2017:EMNLP2017,
  author    = {G{\'a}bor, Kata  and  Zargayouna, Haifa  and  Tellier, Isabelle  and  Buscaldi, Davide  and  Charnois, Thierry},
  title     = {Exploring Vector Spaces for Semantic Relations},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1814--1823},
  abstract  = {Word embeddings are used with success for a variety of tasks involving lexical
	semantic similarities between individual words. Using unsupervised methods and
	just cosine similarity, encouraging results were obtained for analogical
	similarities. In this paper, we explore the potential of pre-trained word
	embeddings to identify generic types of semantic relations in an unsupervised
	experiment. We propose a new relational similarity measure based on the
	combination of word2vec's CBOW input and output vectors which outperforms
	concurrent vector representations, when used for unsupervised clustering on
	SemEval 2010 Relation Classification data.},
  url       = {https://www.aclweb.org/anthology/D17-1193}
}

@inproceedings{kutuzov-velldal-ovrelid:2017:EMNLP2017,
  author = {Kutuzov, Andrey and Velldal, Erik and {\O}vrelid, Lilja},
  title = {Temporal dynamics of semantic relations in word embeddings: an application to predicting armed conflict participants},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  pages = {1824--1829},
  publisher = {Association for Computational Linguistics},
  address = {Copenhagen, Denmark},
  url = {https://www.aclweb.org/anthology/D17-1194},
  abstract = {This paper deals with using word embedding models to trace the temporal
    dynamics of semantic relations between pairs of words. The set-up is similar to
    the well-known analogies task, but expanded with a time dimension. To this end,
    we apply incremental updating of the models with new training texts, including
    incremental vocabulary expansion, coupled with learned transformation matrices
    that let us map between members of the relation.
    The proposed approach is evaluated on the task of predicting insurgent armed
    groups based on geographical locations. The gold standard data for the time
    span 1994--2010 is extracted from the UCDP Armed Conflicts dataset. The results
    show that the method is feasible and outperforms the baselines, but also that
    important work still remains to be done.}
}

@inproceedings{ji-EtAl:2017:EMNLP2017,
  author = {Ji, Yangfeng and Tan, Chenhao and Martschat, Sebastian and Choi, Yejin and Smith, Noah A.},
  title = {Dynamic Entity Representations in Neural Language Models},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  pages = {1830--1839},
  publisher = {Association for Computational Linguistics},
  address = {Copenhagen, Denmark},
  url = {https://www.aclweb.org/anthology/D17-1195},
  abstract = {Understanding a long document requires tracking how entities are introduced and
    evolve over time. We present a new type of language model, EntityNLM, that can
    explicitly model entities, dynamically update their representations, and
    contextually generate their mentions. Our model is generative and flexible; it
    can model an arbitrary number of entities in context while generating each
    entity mention at an arbitrary length. In addition, it can be used for several
    different tasks such as language modeling, coreference resolution, and entity
    prediction. Experimental results with all these tasks demonstrate that our
    model consistently outperforms strong baselines and prior work.}
}

@inproceedings{basile-tamburini:2017:EMNLP2017,
  author = {Basile, Ivano and Tamburini, Fabio},
  title = {Towards Quantum Language Models},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  pages = {1840--1849},
  publisher = {Association for Computational Linguistics},
  address = {Copenhagen, Denmark},
  url = {https://www.aclweb.org/anthology/D17-1196},
  abstract = {This paper presents a new approach for building Language Models using the
    Quantum Probability Theory, a Quantum Language Model (QLM). It mainly shows
    that relying on this probability calculus it is possible to build stochastic
    models able to benefit from quantum correlations due to interference and
    entanglement. We extensively tested our approach showing its superior
    performances, both in terms of model perplexity and inserting it into an
    automatic speech recognition evaluation setting, when compared with
    state-of-the-art language modelling techniques.}
}

@InProceedings{yang-EtAl:2017:EMNLP20173,
  author    = {Yang, Zichao  and  Blunsom, Phil  and  Dyer, Chris  and  Ling, Wang},
  title     = {Reference-Aware Language Models},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1850--1859},
  abstract  = {We propose a general class of language models that treat reference as discrete
	stochastic latent variables. This decision allows for the creation of entity
	mentions by accessing external databases of referents (required by, e.g.,
	dialogue generation) or past internal state (required to explicitly model
	coreferentiality). Beyond simple copying, our coreference model can
	additionally refer to a referent using varied mention forms (e.g., a reference
	to ``Jane'' can be realized as ``she''), a characteristic feature of
	reference in natural languages. Experiments on three representative
	applications show our model variants outperform models based on deterministic
	attention and standard language modeling baselines.},
  url       = {https://www.aclweb.org/anthology/D17-1197}
}

@InProceedings{melamud-dagan-goldberger:2017:EMNLP2017,
  author    = {Melamud, Oren  and  Dagan, Ido  and  Goldberger, Jacob},
  title     = {A Simple Language Model based on {PMI} Matrix Approximations},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1860--1865},
  abstract  = {In this study, we introduce a new approach for learning language models by
	training them to estimate word-context pointwise mutual information (PMI), and
	then deriving the desired conditional probabilities from PMI at test time.
	Specifically, we show that with minor modifications to word2vec's algorithm, we
	get principled language models that are closely related to the well-established
	Noise Contrastive Estimation (NCE) based language models. A compelling aspect
	of our approach is that our models are trained with the same simple negative
	sampling objective function that is commonly used in word2vec to learn word
	embeddings.},
  url       = {https://www.aclweb.org/anthology/D17-1198}
}

@InProceedings{assylbekov-EtAl:2017:EMNLP2017,
  author    = {Assylbekov, Zhenisbek  and  Takhanov, Rustem  and  Myrzakhmetov, Bagdat  and  Washington, Jonathan N.},
  title     = {Syllable-aware Neural Language Models: A Failure to Beat Character-aware Ones},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1866--1872},
  abstract  = {Syllabification does not seem to improve word-level RNN language modeling
	quality when compared to character-based segmentation. However, our best
	syllable-aware language model, achieving performance comparable to the
	competitive character-aware model, has 18\%--33\% fewer parameters and is trained
	1.2--2.2 times faster.},
  url       = {https://www.aclweb.org/anthology/D17-1199}
}

@InProceedings{frermann-szarvas:2017:EMNLP2017,
  author    = {Frermann, Lea  and  Szarvas, Gy{\"o}rgy},
  title     = {Inducing Semantic Micro-Clusters from Deep Multi-View Representations of Novels},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1873--1883},
  abstract  = {Automatically understanding the plot of novels is important both for informing
	literary scholarship and applications such as summarization or recommendation.
	Various models have addressed this task, but their evaluation has remained
	largely intrinsic and qualitative. Here, we propose a principled and scalable
	framework leveraging expert-provided semantic tags (e.g., mystery, pirates) to
	evaluate plot representations in an extrinsic fashion, assessing their ability
	to produce locally coherent groupings of novels (micro-clusters) in model
	space. We present a deep recurrent autoencoder model that learns richly
	structured multi-view plot representations, and show that they i) yield better
	micro-clusters than less structured representations; and ii) are interpretable,
	and thus useful for further literary analysis or labeling of the emerging
	micro-clusters.},
  url       = {https://www.aclweb.org/anthology/D17-1200}
}

@inproceedings{li-EtAl:2017:EMNLP20172,
  author = {Li, Shen and Zhao, Zhe and Liu, Tao and Hu, Renfen and Du, Xiaoyong},
  title = {Initializing Convolutional Filters with Semantic Features for Text Classification},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  pages = {1884--1889},
  publisher = {Association for Computational Linguistics},
  address = {Copenhagen, Denmark},
  url = {https://www.aclweb.org/anthology/D17-1201},
  abstract = {Convolutional Neural Networks (CNNs) are widely used in NLP tasks. This paper
    presents a novel weight initialization method to improve the CNNs for text
    classification. Instead of randomly initializing the convolutional filters, we
    encode semantic features into them, which helps the model focus on learning
    useful features at the beginning of the training. Experiments demonstrate the
    effectiveness of the initialization technique on seven text classification
    tasks, including sentiment analysis and topic classification.}
}

@inproceedings{nikolentzos-EtAl:2017:EMNLP2017,
  author = {Nikolentzos, Giannis and Meladianos, Polykarpos and Rousseau, Francois and Stavrakas, Yannis and Vazirgiannis, Michalis},
  title = {Shortest-Path Graph Kernels for Document Similarity},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  pages = {1890--1900},
  publisher = {Association for Computational Linguistics},
  address = {Copenhagen, Denmark},
  url = {https://www.aclweb.org/anthology/D17-1202},
  abstract = {In this paper, we present a novel document similarity measure based on the
    definition of a graph kernel between pairs of documents. The proposed measure
    takes into account both the terms contained in the documents and the
    relationships between them. By representing each document as a graph-of-words,
    we are able to model these relationships and then determine how similar two
    documents are by using a modified shortest-path graph kernel. We evaluate our
    approach on two tasks and compare it against several baseline approaches using
    various performance metrics such as DET curves and macro-average F1-score.
    Experimental results on a range of datasets showed that our proposed approach
    outperforms traditional techniques and is capable of measuring more accurately
    the similarity between two documents.}
}

@inproceedings{yang-boydgraber-resnik:2017:EMNLP2017,
  author = {Yang, Weiwei and Boyd-Graber, Jordan and Resnik, Philip},
  title = {Adapting Topic Models using Lexical Associations with Tree Priors},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year = {2017},
  month = {September},
  pages = {1901--1906},
  publisher = {Association for Computational Linguistics},
  address = {Copenhagen, Denmark},
  url = {https://www.aclweb.org/anthology/D17-1203},
  abstract = {Models work best when they are optimized taking into account the evaluation
    criteria that people care about. For topic models, people often care about
    interpretability, which can be approximated using measures of lexical
    association. We integrate lexical association into topic optimization using
    tree priors, which provide a flexible framework that can take advantage of both
    first order word associations and the higher-order associations captured by
    word embeddings. Tree priors improve topic interpretability without hurting
    extrinsic performance.}
}

@InProceedings{parde-nielsen:2017:EMNLP2017,
  author    = {Parde, Natalie  and  Nielsen, Rodney},
  title     = {Finding Patterns in Noisy Crowds: Regression-based Annotation Aggregation for Crowdsourced Data},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1907--1912},
  abstract  = {Crowdsourcing offers a convenient means of obtaining labeled data quickly and
	inexpensively. However, crowdsourced labels are often noisier than
	expert-annotated data, making it difficult to aggregate them meaningfully. We
	present an aggregation approach that learns a regression model from
	crowdsourced annotations to predict aggregated labels for instances that have
	no expert adjudications. The predicted labels achieve a correlation of 0.594
	with expert labels on our data, outperforming the best alternative aggregation
	method by 11.9\%. Our approach also outperforms the alternatives on third-party
	datasets.},
  url       = {https://www.aclweb.org/anthology/D17-1204}
}

@InProceedings{wang-EtAl:2017:EMNLP20175,
  author    = {Wang, Chenguang  and  Akbik, Alan  and  Chiticariu, Laura  and  Li, Yunyao  and  Xia, Fei  and  Xu, Anbang},
  title     = {{CROWD-IN-THE-LOOP}: A Hybrid Approach for Annotating Semantic Roles},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1913--1922},
  abstract  = {Crowdsourcing has proven to be an effective method for generating labeled data
	for a range of NLP tasks. However, multiple recent attempts of using
	crowdsourcing to generate gold-labeled training data for semantic role labeling
	(SRL) reported only modest results, indicating that SRL is perhaps too
	difficult a task to be effectively crowdsourced. In this paper, we postulate
	that while producing SRL annotation does require expert involvement in general,
	a large subset of SRL labeling tasks is in fact appropriate for the crowd. We
	present a novel workflow in which we employ a classifier to identify difficult
	annotation tasks and route each task either to experts or crowd workers
	according to their difficulties. Our experimental evaluation shows that the
	proposed approach reduces the workload for experts by over two-thirds, and thus
	significantly reduces the cost of producing SRL annotation at little loss in
	quality.},
  url       = {https://www.aclweb.org/anthology/D17-1205}
}

@InProceedings{hashimoto-EtAl:2017:EMNLP2017,
  author    = {Hashimoto, Kazuma  and  Xiong, Caiming  and  Tsuruoka, Yoshimasa  and  Socher, Richard},
  title     = {A Joint Many-Task Model: Growing a Neural Network for Multiple {NLP} Tasks},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1923--1933},
  abstract  = {Transfer and multi-task learning have traditionally focused on either a single
	source-target pair or very few, similar tasks.
	Ideally, the linguistic levels of morphology, syntax and semantics would
	benefit each other by being trained in a single model.
	We introduce a joint many-task model together with a strategy for successively
	growing its depth to solve increasingly complex tasks.
	Higher layers include shortcut connections to lower-level task predictions to
	reflect linguistic hierarchies.
	We use a simple regularization term to allow for optimizing all model weights
	to improve one task's loss without exhibiting catastrophic interference of the
	other tasks.
	Our single end-to-end model obtains state-of-the-art or competitive results on
	five different tasks from tagging, parsing, relatedness, and entailment tasks.},
  url       = {https://www.aclweb.org/anthology/D17-1206}
}

@InProceedings{zhang-EtAl:2017:EMNLP20175,
  author    = {Zhang, Meng  and  Liu, Yang  and  Luan, Huanbo  and  Sun, Maosong},
  title     = {Earth Mover's Distance Minimization for Unsupervised Bilingual Lexicon Induction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1934--1945},
  abstract  = {Cross-lingual natural language processing hinges on the premise that there
	exists invariance across languages. At the word level, researchers have
	identified such invariance in the word embedding semantic spaces of different
	languages. However, in order to connect the separate spaces, cross-lingual
	supervision encoded in parallel data is typically required. In this paper, we
	attempt to establish the cross-lingual connection without relying on any
	cross-lingual supervision. By viewing word embedding spaces as distributions,
	we propose to minimize their earth mover's distance, a measure of divergence
	between distributions. We demonstrate the success on the unsupervised bilingual
	lexicon induction task. In addition, we reveal an interesting finding that the
	earth mover's distance shows potential as a measure of language difference.},
  url       = {https://www.aclweb.org/anthology/D17-1207}
}

@InProceedings{stahlberg-byrne:2017:EMNLP2017,
  author    = {Stahlberg, Felix  and  Byrne, Bill},
  title     = {Unfolding and Shrinking Neural Machine Translation Ensembles},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1946--1956},
  abstract  = {Ensembling is a well-known technique in neural machine translation (NMT) to
	improve system performance. Instead of a single neural net, multiple neural
	nets with the same topology are trained separately, and the decoder generates
	predictions by averaging over the individual models. Ensembling often improves
	the quality of the generated translations drastically. However, it is not
	suitable for production systems because it is cumbersome and slow. This work
	aims to reduce the runtime to be on par with a single system without
	compromising the translation quality. First, we show that the ensemble can be
	unfolded into a single large neural network which imitates the output of the
	ensemble system. We show that unfolding can already improve the runtime in
	practice since more work can be done on the GPU. We proceed by describing a set
	of techniques to shrink the unfolded network by reducing the dimensionality of
	layers. On Japanese-English we report that the resulting network has the size
	and decoding speed of a single NMT network but performs on the level of a
	3-ensemble system.},
  url       = {https://www.aclweb.org/anthology/D17-1208}
}

@InProceedings{bastings-EtAl:2017:EMNLP2017,
  author    = {Bastings, Joost  and  Titov, Ivan  and  Aziz, Wilker  and  Marcheggiani, Diego  and  Simaan, Khalil},
  title     = {Graph Convolutional Encoders for Syntax-aware Neural Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1957--1967},
  abstract  = {We present a simple and effective approach to incorporating syntactic structure
	into neural attention-based encoder-decoder models for machine translation. We
	rely on graph-convolutional networks (GCNs), a recent class of neural networks
	developed for modeling graph-structured data. Our GCNs use predicted syntactic
	dependency trees of source sentences to produce representations of words (i.e.
	hidden states of the encoder) that are sensitive to their syntactic
	neighborhoods. GCNs take word representations as input and produce word
	representations as output, so they can easily be incorporated as layers into
	standard encoders (e.g., on top of bidirectional RNNs or convolutional neural
	networks). We evaluate their effectiveness with English-German and
	English-Czech translation experiments for different types of encoders and
	observe substantial improvements over their syntax-agnostic versions in all the
	considered setups.},
  url       = {https://www.aclweb.org/anthology/D17-1209}
}

@InProceedings{gu-cho-li:2017:EMNLP2017,
  author    = {Gu, Jiatao  and  Cho, Kyunghyun  and  Li, Victor O. K.},
  title     = {Trainable Greedy Decoding for Neural Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1968--1978},
  abstract  = {Recent research in neural machine translation has largely focused on two
	aspects; neural network architectures and end-to-end learning algorithms. The
	problem of decoding, however, has received relatively little attention from the
	research community. In this paper, we solely focus on the problem of decoding
	given a trained neural machine translation model. Instead of trying to build a
	new decoding algorithm for any specific decoding objective, we propose the idea
	of trainable decoding algorithm in which we train a decoding algorithm to find
	a translation that maximizes an arbitrary decoding objective. More
	specifically, we design an actor that observes and manipulates the hidden state
	of the neural machine translation decoder and propose to train it using a
	variant of deterministic policy gradient. We extensively evaluate the proposed
	algorithm using four language pairs and two decoding objectives and show that
	we can indeed train a trainable greedy decoder that generates a better
	translation (in terms of a target decoding objective) with minimal
	computational overhead.},
  url       = {https://www.aclweb.org/anthology/D17-1210}
}

@InProceedings{yang-mukherjee-dragut:2017:EMNLP2017,
  author    = {Yang, Fan  and  Mukherjee, Arjun  and  Dragut, Eduard},
  title     = {Satirical News Detection and Analysis using Attention Mechanism and Linguistic Features},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1979--1989},
  abstract  = {Satirical news is considered to be entertainment, but it is potentially
	deceptive and harmful. Despite the embedded genre in the article, not everyone
	can recognize the satirical cues and therefore believe the news as true news.
	We observe that satirical cues are often reflected in certain paragraphs rather
	than the whole document. Existing works only consider document-level features
	to detect the satire, which could be limited. We consider paragraph-level
	linguistic features to unveil the satire by incorporating neural network and
	attention mechanism. We investigate the difference between paragraph-level
	features and document-level features, and analyze them on a large satirical
	news dataset. The evaluation shows that the proposed model detects satirical
	news effectively and reveals what features are important at which level.},
  url       = {https://www.aclweb.org/anthology/D17-1211}
}

@InProceedings{fetahu-markert-anand:2017:EMNLP2017,
  author    = {Fetahu, Besnik  and  Markert, Katja  and  Anand, Avishek},
  title     = {Fine Grained Citation Span for References in {Wikipedia}},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1990--1999},
  abstract  = {Verifiability is one of the core editing principles in Wikipedia, where editors
	are encouraged to provide  citations for the added content. For a Wikipedia
	article determining what content is covered by a citation or the citation span
	is not trivial, an important aspect for automated citation finding for
	uncovered content, or fact assessments.
	We address the problem of determining the citation span in Wikipedia articles.
	We approach this problem by classifying which textual fragments in an article
	are covered or hold true given a citation. We propose a sequence classification
	approach where for a paragraph and a citation, we determine the citation span
	at a fine-grained
	level.
	We provide a thorough experimental evaluation and compare our approach against
	baselines adopted from the scientific domain, where we show improvement for all
	evaluation metrics.},
  url       = {https://www.aclweb.org/anthology/D17-1212}
}

@InProceedings{yang-EtAl:2017:EMNLP20174,
  author    = {Yang, Diyi  and  Halfaker, Aaron  and  Kraut, Robert  and  Hovy, Eduard},
  title     = {Identifying Semantic Edit Intentions from Revisions in {Wikipedia}},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2000--2010},
  abstract  = {Most studies on human editing focus merely on syntactic revision operations,
	failing to capture the intentions behind revision changes, which are essential
	for facilitating the single and collaborative writing process. 
	In this work, we develop in collaboration with Wikipedia editors a 13-category
	taxonomy of the semantic intention behind edits in Wikipedia articles. Using
	labeled article edits, we build a computational classifier of intentions that
	achieved a micro-averaged F1 score of 0.621. We use this model to investigate
	edit intention effectiveness: how different types of edits predict the
	retention of newcomers and changes in the quality of articles, two key concerns
	for Wikipedia today. Our analysis shows that the types of edits that users make
	in their first session predict their subsequent survival as Wikipedia editors,
	and articles in different stages need different types of edits.},
  url       = {https://www.aclweb.org/anthology/D17-1213}
}

@InProceedings{hewlett-EtAl:2017:EMNLP2017,
  author    = {Hewlett, Daniel  and  Jones, Llion  and  Lacoste, Alexandre  and  Gur, Izzeddin},
  title     = {Accurate Supervised and Semi-Supervised Machine Reading for Long Documents},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2011--2020},
  abstract  = {We introduce a hierarchical architecture for machine reading capable of
	extracting precise information from long documents.
	The model divides the document into small, overlapping windows and encodes all
	windows in parallel with an RNN.
	It then attends over these window encodings, reducing them to a single
	encoding, which is decoded into an answer using a sequence decoder.
	This hierarchical approach allows the model to scale to longer documents
	without increasing the number of sequential steps.
	In a supervised setting, our model achieves state of the art accuracy of 76.8
	on the WikiReading dataset.
	We also evaluate the model in a semi-supervised setting by downsampling the
	WikiReading training set to create increasingly smaller amounts of supervision,
	while leaving the full unlabeled document corpus to train a sequence
	autoencoder on document windows.
	We evaluate models that can reuse autoencoder states and outputs without
	fine-tuning their weights, allowing for more efficient training and inference.},
  url       = {https://www.aclweb.org/anthology/D17-1214}
}

@InProceedings{jia-liang:2017:EMNLP2017,
  author    = {Jia, Robin  and  Liang, Percy},
  title     = {Adversarial Examples for Evaluating Reading Comprehension Systems},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2021--2031},
  abstract  = {Standard accuracy metrics indicate that 
	reading comprehension systems are making rapid progress,
	but the extent to which these systems truly understand language remains
	unclear.
	To reward systems with real language understanding abilities,
	we propose an adversarial evaluation scheme for the Stanford
	Question Answering Dataset (SQuAD). 
	Our method tests whether systems can answer questions
	about paragraphs that contain adversarially inserted sentences,
	which are automatically generated to distract computer systems
	without changing the correct answer or misleading humans.
	In this adversarial setting,
	the accuracy of sixteen published models
	drops from an average of $75\%$ F1 score to $36\%$;
	when the adversary is allowed to add ungrammatical sequences of words,
	average accuracy on four models decreases further to $7\%$.
	We hope our insights will motivate
	the development of new models that
	understand language more precisely.},
  url       = {https://www.aclweb.org/anthology/D17-1215}
}

@InProceedings{lin-sun-han:2017:EMNLP2017,
  author    = {Lin, Hongyu  and  Sun, Le  and  Han, Xianpei},
  title     = {Reasoning with Heterogeneous Knowledge for Commonsense Machine Comprehension},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2032--2043},
  abstract  = {Reasoning with commonsense knowledge is critical for natural language
	understanding. Traditional methods for commonsense machine comprehension mostly
	only focus on one specific kind of knowledge, neglecting the fact that
	commonsense reasoning requires simultaneously considering different kinds of
	commonsense knowledge. In this paper, we propose a multi-knowledge reasoning
	method, which can exploit heterogeneous knowledge for commonsense machine
	comprehension. Specifically, we first mine different kinds of knowledge
	(including event narrative knowledge, entity semantic knowledge and sentiment
	coherent knowledge) and encode them as inference rules with costs. Then we
	propose a multi-knowledge reasoning model, which selects inference rules for a
	specific reasoning context using attention mechanism, and reasons by
	summarizing all valid inference rules. Experiments on RocStories show that our
	method outperforms traditional models significantly.},
  url       = {https://www.aclweb.org/anthology/D17-1216}
}

@InProceedings{yin-song-zhang:2017:EMNLP2017,
  author    = {Yin, Yichun  and  Song, Yangqiu  and  Zhang, Ming},
  title     = {Document-Level Multi-Aspect Sentiment Classification as Machine Comprehension},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2044--2054},
  abstract  = {Document-level multi-aspect sentiment classification is an important task for
	customer relation management. In this paper, we model the task as a machine
	comprehension problem where pseudo question-answer pairs are constructed by a
	small number of aspect-related keywords and aspect ratings. A hierarchical
	iterative attention model is introduced to build aspect-specific representations
	by frequent and repeated interactions between documents and aspect questions.
	We adopt a hierarchical architecture to represent both word level and sentence
	level information, and use the attention operations for aspect questions and
	documents alternatively with the multiple hop mechanism. Experimental results
	on the TripAdvisor and BeerAdvocate datasets show that our model outperforms
	classical baselines. We will release our code and data for the method
	replicability.},
  url       = {https://www.aclweb.org/anthology/D17-1217}
}

@InProceedings{daxenberger-EtAl:2017:EMNLP2017,
  author    = {Daxenberger, Johannes  and  Eger, Steffen  and  Habernal, Ivan  and  Stab, Christian  and  Gurevych, Iryna},
  title     = {What is the Essence of a Claim? Cross-Domain Claim Identification},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2055--2066},
  abstract  = {Argument mining has become a popular research area in NLP. It typically
	includes the identification of argumentative components, e.g. claims, as the
	central component of an argument. We perform a qualitative analysis across six
	different datasets and show that these appear to conceptualize claims quite
	differently. To learn about the consequences of such different
	conceptualizations of claim for practical applications, we carried out
	extensive experiments using state-of-the-art feature-rich and deep learning
	systems, to identify claims in a cross-domain fashion. While the divergent
	conceptualization of claims in different datasets is indeed harmful to
	cross-domain classification, we show that there are shared properties on the
	lexical level as well as system configurations that can help to overcome these
	gaps.},
  url       = {https://www.aclweb.org/anthology/D17-1218}
}

@InProceedings{du-cardie:2017:EMNLP2017,
  author    = {Du, Xinya  and  Cardie, Claire},
  title     = {Identifying Where to Focus in Reading Comprehension for Neural Question Generation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2067--2073},
  abstract  = {A first step in the task of automatically generating questions for testing
	reading comprehension is to identify \emph{question-worthy} sentences, i.e.
	sentences in a text passage that humans find it worthwhile to ask questions
	about. We propose a hierarchical neural sentence-level sequence tagging model
	for this task, which existing approaches to question generation have ignored.
	The approach is fully data-driven --- with no sophisticated NLP pipelines or
	any hand-crafted rules/features --- and compares favorably to a number of
	baselines when evaluated on the SQuAD data set. When incorporated into an
	existing neural question generation system, the resulting end-to-end system
	achieves state-of-the-art performance for paragraph-level question generation
	for reading comprehension.},
  url       = {https://www.aclweb.org/anthology/D17-1219}
}

@InProceedings{sterckx-EtAl:2017:EMNLP2017,
  author    = {Sterckx, Lucas  and  Naradowsky, Jason  and  Byrne, Bill  and  Demeester, Thomas  and  Develder, Chris},
  title     = {Break it Down for Me: A Study in Automated Lyric Annotation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2074--2080},
  abstract  = {Comprehending lyrics, as found in songs and poems, can pose a challenge to
	human and machine readers alike.  This motivates the need for systems that can
	understand the ambiguity and jargon found in such creative texts, and provide
	commentary to aid readers in reaching the correct interpretation.
	We introduce the task of automated lyric annotation (ALA).  Like text
	simplification, a goal of ALA is to rephrase the original text in a more easily
	understandable manner. However, in ALA the system must often include additional
	information to clarify niche terminology and abstract concepts. To stimulate
	research on this task, we release a large collection of crowdsourced
	annotations for song lyrics. We analyze the performance of translation and
	retrieval models on this task, measuring performance with both automated and
	human evaluation. We find that each model captures a unique type of information
	important to the task.},
  url       = {https://www.aclweb.org/anthology/D17-1220}
}

@InProceedings{li-EtAl:2017:EMNLP20173,
  author    = {Li, Piji  and  Lam, Wai  and  Bing, Lidong  and  Guo, Weiwei  and  Li, Hang},
  title     = {Cascaded Attention based Unsupervised Information Distillation for Compressive Summarization},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2081--2090},
  abstract  = {When people recall and digest what they have read for writing summaries, the
	important content is more likely to attract their attention.
	  Inspired by this observation, we propose a cascaded attention based
	unsupervised
	  model to estimate the salience information from the text for compressive
	multi-document summarization.
	  The attention weights are learned automatically by an unsupervised data
	reconstruction framework which can capture the sentence salience.
	  By adding sparsity constraints on the number of output vectors, we can
	generate condensed information which can be treated as word salience.
	  Fine-grained and coarse-grained sentence compression strategies are
	incorporated to produce compressive summaries. 
	  Experiments on some benchmark data sets show that our framework achieves
	better results than the state-of-the-art methods.},
  url       = {https://www.aclweb.org/anthology/D17-1221}
}

@InProceedings{li-EtAl:2017:EMNLP20174,
  author    = {Li, Piji  and  Lam, Wai  and  Bing, Lidong  and  Wang, Zihao},
  title     = {Deep Recurrent Generative Decoder for Abstractive Text Summarization},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2091--2100},
  abstract  = {We propose a new framework for abstractive text summarization based on a
	sequence-to-sequence oriented encoder-decoder model equipped with a deep
	recurrent generative decoder (DRGN).
	  Latent structure information implied in the target summaries is learned based
	on a recurrent latent random model for improving the summarization quality.
	  Neural variational inference is employed to address the intractable posterior
	inference for the recurrent latent variables.
	  Abstractive summaries are generated based on both the generative latent
	variables and the discriminative deterministic states.
	  Extensive experiments on some benchmark datasets in different languages show
	that DRGN achieves improvements over the state-of-the-art methods.},
  url       = {https://www.aclweb.org/anthology/D17-1222}
}

@InProceedings{isonuma-EtAl:2017:EMNLP2017,
  author    = {Isonuma, Masaru  and  Fujino, Toru  and  Mori, Junichiro  and  Matsuo, Yutaka  and  Sakata, Ichiro},
  title     = {Extractive Summarization Using Multi-Task Learning with Document Classification},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2101--2110},
  abstract  = {The need for automatic document summarization that can be used for practical
	applications is increasing rapidly. In this paper, we propose a general
	framework for summarization that extracts sentences from a document using
	externally related information. Our work is aimed at single document
	summarization using small amounts of reference summaries. In particular, we
	address document summarization in the framework of multi-task learning using
	curriculum learning for sentence extraction and document classification. The
	proposed framework enables us to obtain better feature representations to
	extract sentences from documents. We evaluate our proposed summarization method
	on two datasets: financial report and news corpus. Experimental results
	demonstrate that our summarizers achieve performance that is comparable to
	state-of-the-art systems.},
  url       = {https://www.aclweb.org/anthology/D17-1223}
}

@InProceedings{zhang-wan:2017:EMNLP2017,
  author    = {Zhang, Jianmin  and  Wan, Xiaojun},
  title     = {Towards Automatic Construction of News Overview Articles by News Synthesis},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2111--2116},
  abstract  = {In this paper we investigate a new task of automatically constructing an
	overview article from a given set of news articles about a news event.
	We propose a news synthesis approach to address this task based on passage
	segmentation, ranking, selection and merging.
	Our proposed approach is compared with several typical multi-document
	summarization methods on the Wikinews dataset, and achieves the best
	performance on both automatic evaluation and manual evaluation.},
  url       = {https://www.aclweb.org/anthology/D17-1224}
}

@InProceedings{zhao-huang:2017:EMNLP2017,
  author    = {Zhao, Kai  and  Huang, Liang},
  title     = {Joint Syntacto-Discourse Parsing and the Syntacto-Discourse Treebank},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2117--2123},
  abstract  = {Discourse parsing has long been treated as a stand-alone problem independent
	from constituency or dependency parsing. Most attempts at this problem rely on
	annotated text segmentations (Elementary Discourse Units, EDUs) and
	sophisticated sparse or continuous features to extract syntactic information.
	In this paper we propose the first end-to-end discourse parser that jointly
	parses in both syntax and discourse levels, as well as the first
	syntacto-discourse treebank by integrating the Penn Treebank and the RST
	Treebank. Built upon our recent span-based constituency parser, this joint
	syntacto-discourse parser requires no preprocessing efforts such as
	segmentation or feature extraction, making discourse parsing more convenient.
	Empirically, our parser achieves the state-of-the-art end-to-end discourse
	parsing accuracy.},
  url       = {https://www.aclweb.org/anthology/D17-1225}
}

@InProceedings{choubey-huang:2017:EMNLP20172,
  author    = {Choubey, Prafulla Kumar  and  Huang, Ruihong},
  title     = {Event Coreference Resolution by Iteratively Unfolding Inter-dependencies among Events},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2124--2133},
  abstract  = {We introduce a novel iterative approach for event coreference resolution that
	gradually builds event clusters by exploiting inter-dependencies among event
	mentions within the same chain as well as across event chains. Among event
	mentions in the same chain, we distinguish within- and cross-document event
	coreference links by using two distinct pairwise classifiers, trained
	separately to capture differences in feature distributions of within- and
	cross-document event clusters. Our event coreference approach alternates
	between WD and CD clustering and combines arguments from both event clusters
	after every merge, continuing till no more merge can be made. And then it
	performs further merging between event chains that are both closely related to
	a set of other chains of events. Experiments on the ECB+ corpus show that our
	model outperforms state-of-the-art methods in joint task of WD and CD event
	coreference resolution.},
  url       = {https://www.aclweb.org/anthology/D17-1226}
}

@InProceedings{huang-zhao-ma:2017:EMNLP2017,
  author    = {Huang, Liang  and  Zhao, Kai  and  Ma, Mingbo},
  title     = {When to Finish? Optimal Beam Search for Neural Text Generation (modulo beam size)},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2134--2139},
  abstract  = {In neural text generation such as neural
	machine translation, summarization, and
	image captioning, beam search is widely
	used to improve the output text quality.
	However, in the neural generation setting,
	hypotheses can finish in different
	steps, which makes it difficult to decide
	when to end beam search to ensure optimality.
	We propose a provably optimal
	beam search algorithm that will always return
	the optimal-score complete hypothesis
	(modulo beam size), and finish as soon
	as the optimality is established. To counter
	neural generation's tendency for shorter
	hypotheses, we also introduce a bounded
	length reward mechanism which allows a
	modified version of our beam search algorithm
	to remain optimal. Experiments
	on neural machine translation demonstrate
	that our principled beam search algorithm
	leads to improvement in BLEU score over
	previously proposed alternatives.},
  url       = {https://www.aclweb.org/anthology/D17-1227}
}

@InProceedings{wang-EtAl:2017:EMNLP20176,
  author    = {Wang, Di  and  Jojic, Nebojsa  and  Brockett, Chris  and  Nyberg, Eric},
  title     = {Steering Output Style and Topic in Neural Response Generation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2140--2150},
  abstract  = {We propose simple and flexible training and decoding methods for influencing
	output style and topic in neural encoder-decoder based language generation.
	This capability is desirable in a variety of applications, including
	conversational systems, where successful agents need to produce language in a
	specific style and generate responses steered by a human puppeteer or external
	knowledge. We decompose the neural generation process into empirically easier
	sub-problems: a faithfulness model and a decoding method based on
	selective-sampling. We also describe training and sampling algorithms that bias
	the generation process with a specific language style restriction, or a topic
	restriction. Human evaluation results show that our proposed methods are able
	to restrict style and topic without degrading output quality in
	conversational tasks.},
  url       = {https://www.aclweb.org/anthology/D17-1228}
}

@InProceedings{tran-zukerman-haffari:2017:EMNLP2017,
  author    = {Tran, Quan Hung  and  Zukerman, Ingrid  and  Haffari, Gholamreza},
  title     = {Preserving Distributional Information in Dialogue Act Classification},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2151--2156},
  abstract  = {This paper introduces a novel training/decoding strategy for sequence labeling.
	Instead of greedily choosing a label at each time step, and using it for the
	next prediction, we retain the probability distribution over the current label,
	and pass this distribution to the next prediction. This approach allows us to
	avoid the effect of label bias and error propagation in sequence
	learning/decoding. Our experiments on dialogue act classification demonstrate
	the effectiveness of this approach. Even though our underlying neural network
	model is relatively simple, it outperforms more complex neural models,
	achieving state-of-the-art results on the MapTask and Switchboard corpora.},
  url       = {https://www.aclweb.org/anthology/D17-1229}
}

@InProceedings{li-EtAl:2017:EMNLP20175,
  author    = {Li, Jiwei  and  Monroe, Will  and  Shi, Tianlin  and  Jean, S{\'e}bastien  and  Ritter, Alan  and  Jurafsky, Dan},
  title     = {Adversarial Learning for Neural Dialogue Generation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2157--2169},
  abstract  = {We apply adversarial training to open-domain dialogue generation,
	training a system to produce sequences that are
	 indistinguishable from human-generated dialogue utterances. 
	We cast the task as a reinforcement learning problem where we jointly train two
	systems: a generative model to produce response sequences, and a
	discriminator---analogous to the human evaluator in the Turing test---to
	distinguish between the human-generated dialogues and the machine-generated ones.
	In this generative adversarial network approach,
	the outputs from the discriminator are 
	used to encourage the system towards more human-like dialogue.
	Further, we investigate models
	for adversarial  evaluation that 
	uses success in fooling an adversary as a dialogue evaluation metric,
	while avoiding a number of potential pitfalls.
	Experimental results on several
	metrics, including adversarial evaluation, demonstrate
	that the adversarially-trained system generates higher-quality responses
	than previous baselines.},
  url       = {https://www.aclweb.org/anthology/D17-1230}
}

@InProceedings{liu-EtAl:2017:EMNLP20176,
  author    = {Liu, Yang  and  Han, Kun  and  Tan, Zhao  and  Lei, Yun},
  title     = {Using Context Information for Dialog Act Classification in {DNN} Framework},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2170--2178},
  abstract  = {Previous work on dialog act (DA) classification has investigated different
	methods, such as hidden Markov models, maximum entropy, conditional random
	fields, graphical models, and support vector machines.
	A few recent studies explored using deep learning neural networks for DA
	classification, however, it is not clear yet what is the best method for using
	dialog context or DA sequential information, and how much gain it brings. This
	paper proposes several ways of using context information for DA classification,
	all in the deep learning framework. The baseline system classifies each
	utterance using the convolutional neural networks (CNN). Our proposed methods
	include using hierarchical models (recurrent neural networks (RNN) or CNN) for
	DA sequence tagging where the bottom layer takes the sentence CNN
	representation as input, concatenating predictions from the previous utterances
	with the CNN vector for classification, and performing sequence decoding based
	on the predictions from the sentence CNN model. 
	We conduct thorough experiments and comparisons on the Switchboard corpus,
	demonstrate that incorporating context information significantly improves DA
	classification, and show that we achieve new state-of-the-art performance for
	this task.},
  url       = {https://www.aclweb.org/anthology/D17-1231}
}

@InProceedings{jo-EtAl:2017:EMNLP2017,
  author    = {Jo, Yohan  and  Yoder, Michael  and  Jang, Hyeju  and  Rose, Carolyn},
  title     = {Modeling Dialogue Acts with Content Word Filtering and Speaker Preferences},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2179--2189},
  abstract  = {We present an unsupervised model of dialogue act sequences in conversation. By
	modeling topical themes as transitioning more slowly than dialogue acts in
	conversation, our model de-emphasizes content-related words in order to focus
	on conversational function words that signal dialogue acts. We also incorporate
	speaker tendencies to use some acts more than others as an additional predictor
	of dialogue act prevalence beyond temporal dependencies. According to the
	evaluation presented on two dissimilar corpora, the CNET forum and NPS Chat
	corpus, the effectiveness of each modeling assumption is found to vary
	depending on characteristics of the data. De-emphasizing content-related words
	yields improvement on the CNET corpus, while utilizing speaker tendencies is
	advantageous on the NPS corpus. The components of our model complement one
	another to achieve robust performance on both corpora and outperform
	state-of-the-art baseline models.},
  url       = {https://www.aclweb.org/anthology/D17-1232}
}

@InProceedings{yao-EtAl:2017:EMNLP2017,
  author    = {Yao, Lili  and  Zhang, Yaoyuan  and  Feng, Yansong  and  Zhao, Dongyan  and  Yan, Rui},
  title     = {Towards Implicit Content-Introducing for Generative Short-Text Conversation Systems},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2190--2199},
  abstract  = {The study on human-computer conversation systems is a hot research topic
	nowadays. One of the prevailing methods to build the system is using the
	generative Sequence-to-Sequence (Seq2Seq) model through neural networks.
	However, the standard Seq2Seq model is prone to generate trivial responses. In
	this paper, we aim to generate a more meaningful and informative reply when
	answering a given question. We propose an implicit content-introducing method
	which incorporates additional information into the Seq2Seq model in a flexible
	way. Specifically, we fuse the general decoding and the auxiliary cue word
	information through our proposed hierarchical gated fusion unit. Experiments on
	real-life data demonstrate that our model consistently outperforms a set of
	competitive baselines in terms of BLEU scores and human evaluation.},
  url       = {https://www.aclweb.org/anthology/D17-1233}
}

@InProceedings{chang-EtAl:2017:EMNLP2017,
  author    = {Chang, Cheng  and  Yang, Runzhe  and  Chen, Lu  and  Zhou, Xiang  and  Yu, Kai},
  title     = {Affordable On-line Dialogue Policy Learning},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2200--2209},
  abstract  = {The key to building an evolvable dialogue system in real-world scenarios is to
	ensure an affordable on-line dialogue policy learning, which requires the
	on-line learning process to be safe, efficient and economical. But in reality,
	due to the scarcity of real interaction data, the dialogue system usually grows
	slowly. Besides, the poor initial dialogue policy easily leads to bad user
	experience and incurs a failure of attracting users to contribute training
	data, so that the learning process is unsustainable. To accurately depict this,
	 two quantitative metrics are proposed to assess safety and efficiency issues.
	For solving the unsustainable learning problem, we proposed a complete
	companion teaching framework incorporating the guidance from the human teacher.
	Since the human teaching is expensive, we compared various teaching schemes
	answering the question how and when to teach, to economically utilize teaching
	budget, so that make the online learning process affordable.},
  url       = {https://www.aclweb.org/anthology/D17-1234}
}

@InProceedings{shao-EtAl:2017:EMNLP2017,
  author    = {Shao, Yuanlong  and  Gouws, Stephan  and  Britz, Denny  and  Goldie, Anna  and  Strope, Brian  and  Kurzweil, Ray},
  title     = {Generating High-Quality and Informative Conversation Responses with Sequence-to-Sequence Models},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2210--2219},
  abstract  = {Sequence-to-sequence models have been applied to the conversation response
	generation problem where the source sequence is the conversation history and
	the target sequence is the response. Unlike translation, conversation
	responding is inherently creative. The generation of long, informative,
	coherent, and diverse responses remains a hard task.
	In this work, we focus on the single turn setting. We add self-attention to the
	decoder to maintain coherence in longer responses, and we propose a practical
	approach, called the glimpse-model, for scaling to large datasets. We introduce
	a stochastic beam-search algorithm with segment-by-segment reranking which lets
	us inject diversity earlier in the generation process. We trained on a combined
	data set of over 2.3B conversation messages mined from the web. In human
	evaluation studies, our method produces longer responses overall, with a higher
	proportion rated as acceptable and excellent as length increases, compared to
	baseline sequence-to-sequence models with explicit length-promotion. A back-off
	strategy produces better responses overall, in the full spectrum of lengths.},
  url       = {https://www.aclweb.org/anthology/D17-1235}
}

@InProceedings{eshghi-shalyminov-lemon:2017:EMNLP2017,
  author    = {Eshghi, Arash  and  Shalyminov, Igor  and  Lemon, Oliver},
  title     = {Bootstrapping incremental dialogue systems from minimal data: the generalisation power of dialogue grammars},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2220--2230},
  abstract  = {We investigate an end-to-end method for automatically inducing task-based
	dialogue systems from small amounts  of unannotated dialogue data. It combines
	an incremental semantic grammar  - Dynamic Syntax and Type Theory with Records
	(DS-TTR) - with Reinforcement Learning (RL), where language generation and
	dialogue management are a joint  decision problem. The systems thus produced
	are incremental: dialogues are processed word-by-word, shown previously to be
	essential in supporting natural, spontaneous dialogue. We hypothesised that the
	rich linguistic knowledge within the grammar should enable a combinatorially
	large number of dialogue variations to be processed, even when trained on very
	few dialogues. Our experiments show that our model can process 74\% of the
	Facebook AI bAbI dataset even when trained on only 0.13\% of the data (5
	dialogues).  It can in addition process 65\% of bAbI+, a corpus we created by
	systematically adding incremental dialogue phenomena such as restarts and
	self-corrections to bAbI. We compare our model with a state-of-the-art
	retrieval model, MEMN2N. We find that, in terms of semantic accuracy, the
	MEMN2N model shows very poor robustness to the bAbI+ transformations even when
	trained on the full bAbI dataset.},
  url       = {https://www.aclweb.org/anthology/D17-1236}
}

@InProceedings{peng-EtAl:2017:EMNLP2017,
  author    = {Peng, Baolin  and  Li, Xiujun  and  Li, Lihong  and  Gao, Jianfeng  and  Celikyilmaz, Asli  and  Lee, Sungjin  and  Wong, Kam-Fai},
  title     = {Composite Task-Completion Dialogue Policy Learning via Hierarchical Deep Reinforcement Learning},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2231--2240},
  abstract  = {Building a dialogue agent to fulfill complex tasks, such as travel planning, is
	challenging because the agent has to learn to collectively complete multiple
	subtasks. For example, the agent needs to reserve a hotel and book a flight so
	that there leaves enough time for commute between arrival and hotel check-in.
	This paper addresses this challenge by formulating the task in the mathematical
	framework of options over Markov Decision Processes (MDPs), and proposing a
	hierarchical deep reinforcement learning approach to learning a dialogue
	manager that operates at different temporal scales. The dialogue manager
	consists of: (1) a top-level dialogue policy that selects among subtasks or
	options, (2) a low-level dialogue policy that selects primitive actions to
	complete the subtask given by the top-level policy, and (3) a global state
	tracker that helps ensure all cross-subtask constraints be satisfied.
	Experiments on a travel planning task with simulated and real users show that
	our approach leads to significant improvements over three baselines, two based
	on handcrafted rules and the other based on flat deep reinforcement learning.},
  url       = {https://www.aclweb.org/anthology/D17-1237}
}

@InProceedings{novikova-EtAl:2017:EMNLP2017,
  author    = {Novikova, Jekaterina  and  Du{\v{s}}ek, Ond{\v{r}}ej  and  Cercas Curry, Amanda  and  Rieser, Verena},
  title     = {Why We Need New Evaluation Metrics for {NLG}},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2241--2252},
  abstract  = {The majority of NLG evaluation relies on automatic metrics, such as BLEU. In
	this paper, we motivate the need for novel, system- and data-independent
	automatic evaluation methods: We investigate a wide range of metrics, including
	state-of-the-art word-based and novel grammar-based ones, and demonstrate that
	they only weakly reflect human judgements of system outputs as generated by
	data-driven, end-to-end NLG. We also show that metric performance is data- and
	system-specific. Nevertheless, our results also suggest that automatic metrics
	perform reliably at system-level and can support system development by finding
	cases where a system performs poorly.},
  url       = {https://www.aclweb.org/anthology/D17-1238}
}

@InProceedings{wiseman-shieber-rush:2017:EMNLP2017,
  author    = {Wiseman, Sam  and  Shieber, Stuart  and  Rush, Alexander},
  title     = {Challenges in Data-to-Document Generation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2253--2263},
  abstract  = {Recent neural models have shown significant progress on the problem of
	generating short descriptive texts conditioned on a small number of database
	records. In this work, we suggest a slightly more difficult data-to-text
	generation task, and investigate how effective current approaches are on this
	task. In particular, we introduce a new, large-scale corpus of data records
	paired with descriptive documents, propose a series of extractive evaluation
	methods for analyzing performance, and obtain baseline results using current
	neural generation methods. Experiments show that these models produce fluent
	text, but fail to convincingly approximate human-generated documents. Moreover,
	even templated baselines exceed the performance of these neural models on some
	metrics, though copy- and reconstruction-based extensions lead to noticeable
	improvements.},
  url       = {https://www.aclweb.org/anthology/D17-1239}
}

@InProceedings{patro-EtAl:2017:EMNLP2017,
  author    = {Patro, Jasabanta  and  Samanta, Bidisha  and  Singh, Saurabh  and  Basu, Abhipsa  and  Mukherjee, Prithwish  and  Choudhury, Monojit  and  Mukherjee, Animesh},
  title     = {All that is {English} may be {Hindi}: Enhancing language identification through automatic ranking of the likeliness of word borrowing in social media},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2264--2274},
  abstract  = {In this paper, we present a set of computational methods to identify the
	likeliness of a word being borrowed, based on the signals from social media. In
	terms of Spearman's correlation values, our methods perform more than two
	times better ($\sim$0.62) in predicting the borrowing likeliness compared to the
	best performing baseline ($\sim$0.26) reported in literature. Based on this
	likeliness estimate we asked annotators to re-annotate the language tags of
	foreign words in predominantly native contexts. In 88\% of cases the annotators
	felt that the foreign language tag should be replaced by native language tag,
	thus indicating a huge scope for improvement of automatic language
	identification systems.},
  url       = {https://www.aclweb.org/anthology/D17-1240}
}

@InProceedings{ding-bickel-pan:2017:EMNLP2017,
  author    = {Ding, Tao  and  Bickel, Warren K.  and  Pan, Shimei},
  title     = {Multi-View Unsupervised User Feature Embedding for Social Media-based Substance Use Prediction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2275--2284},
  abstract  = {In this paper, we demonstrate how the state-of-the-art machine learning and
	text mining techniques can be used to build effective social media-based
	substance use detection systems.  Since a substance use ground truth is
	difficult to obtain on a large scale, to maximize system performance, we
	explore different unsupervised feature learning methods to take advantage of a
	large amount of unsupervised social media data. We also demonstrate the benefit
	of using multi-view unsupervised feature learning to combine heterogeneous user
	information such as Facebook ``likes'' and ``status updates'' to enhance system
	performance.  Based on our evaluation, our best models achieved 86\% AUC for
	predicting tobacco use, 81\% for alcohol use and 84\% for illicit drug use, all
	of which significantly outperformed existing methods. Our investigation has
	also uncovered interesting relations between a user's social media behavior
	(e.g., word usage) and substance use.},
  url       = {https://www.aclweb.org/anthology/D17-1241}
}

@InProceedings{garimella-banea-mihalcea:2017:EMNLP2017,
  author    = {Garimella, Aparna  and  Banea, Carmen  and  Mihalcea, Rada},
  title     = {Demographic-aware word associations},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2285--2295},
  abstract  = {Variations of word associations across different groups of people can provide
	insights into people's psychologies and their world views. To capture these
	variations, we introduce the task of demographic-aware word associations. We
	build a new gold standard dataset consisting of word association responses for
	approximately 300 stimulus words, collected from more than 800 respondents of
	different gender (male/female) and from different locations (India/United
	States), and show that there are significant variations in the word
	associations made by these groups. We also introduce a new demographic-aware
	word association model based on a neural net skip-gram architecture, and show
	how computational methods for measuring word associations that specifically
	account for writer demographics can outperform generic methods that are
	agnostic to such information.},
  url       = {https://www.aclweb.org/anthology/D17-1242}
}

@InProceedings{cheng-fang-ostendorf:2017:EMNLP2017,
  author    = {Cheng, Hao  and  Fang, Hao  and  Ostendorf, Mari},
  title     = {A Factored Neural Network Model for Characterizing Online Discussions in Vector Space},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2296--2306},
  abstract  = {We develop a novel factored neural model that learns comment embeddings in an
	unsupervised way leveraging the structure of distributional context in online
	discussion forums. The model links different context with related language
	factors in the embedding space, providing a way to interpret the factored
	embeddings. Evaluated on a community endorsement prediction task using a large
	collection of topic-varying Reddit discussions, the factored embeddings
	consistently achieve improvement over other text representations. Qualitative
	analysis shows that the model captures community style and topic, as well as
	response trigger patterns.},
  url       = {https://www.aclweb.org/anthology/D17-1243}
}

@InProceedings{rashid-blanco:2017:EMNLP2017,
  author    = {Rashid, Farzana  and  Blanco, Eduardo},
  title     = {Dimensions of Interpersonal Relationships: Corpus and Experiments},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2307--2316},
  abstract  = {This paper presents a corpus and experiments to determine dimensions of
	interpersonal relationships. We define a set of dimensions heavily inspired by
	work in social science. We create a corpus by retrieving pairs of people, and
	then annotating dimensions for their relationships. A corpus analysis shows
	that dimensions can be annotated reliably. Experimental results show that given
	a pair of people, values to dimensions can be assigned automatically.},
  url       = {https://www.aclweb.org/anthology/D17-1244}
}

@InProceedings{dusmanu-cabrio-villata:2017:EMNLP2017,
  author    = {Dusmanu, Mihai  and  Cabrio, Elena  and  Villata, Serena},
  title     = {Argument Mining on {Twitter}: Arguments, Facts and Sources},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2317--2322},
  abstract  = {Social media collect and spread on the Web personal opinions, facts, fake news
	and all kinds of information users may be interested in. Applying argument
	mining methods to such heterogeneous data sources is a challenging open
	research issue, in particular considering the peculiarities of the language
	used to write textual messages on social media. In addition, new issues emerge
	when dealing with arguments posted on such platforms, such as the need to make
	a distinction between personal opinions and actual facts, and to detect the
	source disseminating information about such facts to allow for provenance
	verification. In this paper, we apply supervised classification to identify
	arguments on Twitter, and we present two new tasks for argument mining, namely
	facts recognition and source identification. We study the feasibility of the
	approaches proposed to address these tasks on a set of tweets related to the
	Grexit and Brexit news topics.},
  url       = {https://www.aclweb.org/anthology/D17-1245}
}

@InProceedings{aoki-EtAl:2017:EMNLP2017,
  author    = {Aoki, Tatsuya  and  Sasano, Ryohei  and  Takamura, Hiroya  and  Okumura, Manabu},
  title     = {Distinguishing {Japanese} Non-standard Usages from Standard Ones},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2323--2328},
  abstract  = {We focus on non-standard usages of common words on social media. In the context
	of social media, words sometimes have other usages that are totally different
	from their original. In this study, we attempt to distinguish non-standard
	usages on social media from standard ones in an unsupervised manner. Our basic
	idea is that non-standardness can be measured by the inconsistency between the
	expected meaning of the target word and the given context. For this purpose, we
	use context embeddings derived from word embeddings. Our experimental results
	show that the model leveraging the context embedding outperforms other methods
	and provide us with findings, for example, on how to construct context
	embeddings and which corpus to use.},
  url       = {https://www.aclweb.org/anthology/D17-1246}
}

@InProceedings{sap-EtAl:2017:EMNLP2017,
  author    = {Sap, Maarten  and  Prasettio, Marcella Cindy  and  Holtzman, Ari  and  Rashkin, Hannah  and  Choi, Yejin},
  title     = {Connotation Frames of Power and Agency in Modern Films},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2329--2334},
  abstract  = {The framing of an action influences how we perceive its actor. We introduce
	connotation frames of power and agency, a pragmatic formalism organized using
	frame semantic representations, to model how different levels of power and
	agency are implicitly projected on actors through their actions. We use the new
	power and agency frames to measure the subtle, but prevalent, gender bias in
	the portrayal of modern film characters and provide insights that deviate from
	the well-known Bechdel test. Our contributions include an extended lexicon of
	connotation frames along with a web interface that provides a comprehensive
	analysis through the lens of connotation frames.},
  url       = {https://www.aclweb.org/anthology/D17-1247}
}

@InProceedings{preoiucpietro-chandraguntuku-ungar:2017:EMNLP2017,
  author    = {Preo{\c{t}}iuc-Pietro, Daniel  and  Chandra Guntuku, Sharath  and  Ungar, Lyle},
  title     = {Controlling Human Perception of Basic User Traits},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2335--2341},
  abstract  = {Much of our online communication is text-mediated and, lately, more common with
	automated agents. Unlike interacting with humans, these agents currently do not
	tailor their language to the type of person they are communicating to. In this
	pilot study, we measure the extent to which human perception of basic user
	trait information -- gender and age -- is controllable through text. Using
	automatic models of gender and age prediction, we estimate which tweets posted
	by a user are more likely to mis-characterize his traits. We perform multiple
	controlled crowdsourcing experiments in which we show that we can reduce the
	human prediction accuracy of gender to almost random -- an over 20\% drop in
	accuracy. Our experiments show that it is practically feasible for multiple
	applications such as text generation, text summarization or machine translation
	to be tailored to specific traits and perceived as such.},
  url       = {https://www.aclweb.org/anthology/D17-1248}
}

@InProceedings{gautrais-EtAl:2017:EMNLP2017,
  author    = {Gautrais, Cl{\'e}ment  and  Cellier, Peggy  and  Quiniou, Ren{\'e}  and  Termier, Alexandre},
  title     = {Topic Signatures in Political Campaign Speeches},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2342--2347},
  abstract  = {Highlighting the recurrence of topics usage in candidates' speeches is a key
	feature to identify the main ideas of each candidate during a political
	campaign. In this paper, we present a method combining standard topic modeling
	with signature mining for analyzing topic recurrence in speeches of Clinton and
	Trump during the 2016 American presidential campaign. The results show that the
	method extracts automatically the main ideas of each candidate and, in
	addition, provides information about the evolution of these topics during the
	campaign.},
  url       = {https://www.aclweb.org/anthology/D17-1249}
}

@InProceedings{schwartz-EtAl:2017:EMNLP2017,
  author    = {Schwartz, H. Andrew  and  Rouhizadeh, Masoud  and  Bishop, Michael  and  Tetlock, Philip  and  Mellers, Barbara  and  Ungar, Lyle},
  title     = {Assessing Objective Recommendation Quality through Political Forecasting},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2348--2357},
  abstract  = {Recommendations are often rated for their subjective quality, but few
	researchers have studied comment quality in terms of objective utility. We
	explore recommendation quality assessment with respect to both subjective
	(i.e., users' ratings) and objective (i.e., did it influence? did it improve
	decisions?) metrics in a
	massive online geopolitical forecasting system, ultimately comparing linguistic
	characteristics of each quality metric. Using a variety of features, we predict
	all types of quality with better accuracy than the simple yet strong baseline
	of comment length. Looking at the most predictive content illustrates rater
	biases; for example, forecasters are subjectively biased in favor of comments
	mentioning business transactions or dealings as well as material things, even
	though such comments do not indeed prove any more useful objectively.
	Additionally, more complex sentence constructions, as evidenced by subordinate
	conjunctions, are characteristic of comments leading to objective improvements
	in forecasting.},
  url       = {https://www.aclweb.org/anthology/D17-1250}
}

@InProceedings{shirakawa-hara-maekawa:2017:EMNLP2017,
  author    = {Shirakawa, Masumi  and  Hara, Takahiro  and  Maekawa, Takuya},
  title     = {Never Abandon Minorities: Exhaustive Extraction of Bursty Phrases on Microblogs Using Set Cover Problem},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2358--2367},
  abstract  = {We propose a language-independent data-driven method to exhaustively extract
	bursty phrases of arbitrary forms (e.g., phrases other than simple noun
	phrases) from microblogs. The burst (i.e., the rapid increase of the
	occurrence) of a phrase causes the burst of overlapping N-grams including
	incomplete ones. In other words, bursty incomplete N-grams inevitably overlap
	bursty phrases. Thus, the proposed method performs the extraction of bursty
	phrases as the set cover problem in which all bursty N-grams are covered by a
	minimum set of bursty phrases. Experimental results using Japanese Twitter data
	showed that the proposed method outperformed word-based, noun phrase-based, and
	segmentation-based methods both in terms of accuracy and coverage.},
  url       = {https://www.aclweb.org/anthology/D17-1251}
}

@InProceedings{peng-chang-yih:2017:EMNLP2017,
  author    = {Peng, Haoruo and Chang, Ming-Wei and Yih, Wen-tau},
  title     = {Maximum Margin Reward Networks for Learning from Explicit and Implicit Supervision},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2368--2378},
  abstract  = {Neural networks have achieved state-of-the-art performance on several structured-output prediction tasks, trained in a fully supervised fashion.
	However, annotated examples in structured domains are often costly to obtain, which thus limits the applications of neural networks.
	In this work, we propose Maximum Margin Reward Networks, a neural network-based framework that aims to learn from both explicit (full structures) and implicit supervision signals (delayed feedback on the correctness of the predicted structure).
	On named entity recognition and semantic parsing, our model outperforms previous systems on the benchmark datasets, CoNLL-2003 and WebQuestionsSP.},
  url       = {https://www.aclweb.org/anthology/D17-1252}
}

@InProceedings{wachsmuth-EtAl:2017:EMNLP2017,
  author    = {Wachsmuth, Henning and Da San Martino, Giovanni and Kiesel, Dora and Stein, Benno},
  title     = {The Impact of Modeling Overall Argumentation with Tree Kernels},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2379--2389},
  abstract  = {Several approaches have been proposed to model either the explicit sequential structure of an argumentative text or its implicit hierarchical structure.
	So far, the adequacy of these models of overall argumentation remains unclear.
	This paper asks what type of structure is actually important to tackle downstream tasks in computational argumentation.
	We analyze patterns in the overall argumentation of texts from three corpora.
	Then, we adapt the idea of positional tree kernels in order to capture sequential and hierarchical argumentative structure together for the first time.
	In systematic experiments for three text classification tasks, we find strong evidence for the impact of both types of structure.
	Our results suggest that either of them is necessary while their combination may be beneficial.},
  url       = {https://www.aclweb.org/anthology/D17-1253}
}

@InProceedings{gan-EtAl:2017:EMNLP2017,
  author    = {Gan, Zhe and Pu, Yunchen and Henao, Ricardo and Li, Chunyuan and He, Xiaodong and Carin, Lawrence},
  title     = {Learning Generic Sentence Representations Using Convolutional Neural Networks},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2390--2400},
  abstract  = {We propose a new encoder-decoder approach to learn distributed sentence representations that are applicable to multiple purposes.
	The model is learned by using a convolutional neural network as an encoder to map an input sentence into a continuous vector, and using a long short-term memory recurrent neural network as a decoder.
	Several tasks are considered, including sentence reconstruction and future sentence prediction.
	Further, a hierarchical encoder-decoder model is proposed to encode a sentence to predict multiple future sentences.
	By training our models on a large collection of novels, we obtain a highly generic convolutional sentence encoder that performs well in practice.
	Experimental results on several benchmark datasets, and across a broad range of applications, demonstrate the superiority of the proposed model over competing methods.},
  url       = {https://www.aclweb.org/anthology/D17-1254}
}

@InProceedings{amiri-miller-savova:2017:EMNLP2017,
  author    = {Amiri, Hadi  and  Miller, Timothy  and  Savova, Guergana},
  title     = {Repeat before Forgetting: Spaced Repetition for Efficient and Effective Training of Neural Networks},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2401--2410},
  abstract  = {We present a novel approach for training artificial neural networks. Our
	approach is inspired by broad evidence in psychology that shows human learners
	can learn efficiently and effectively by increasing intervals of time between
	subsequent reviews of previously learned materials (spaced repetition). We
	investigate the analogy between training neural models and findings in
	psychology about human memory model and develop an efficient and effective
	algorithm to train neural models. The core part of our algorithm is a
	cognitively-motivated scheduler according to which training instances and their
	``reviews'' are spaced over time. Our algorithm uses only 34--50\% of data per
	epoch, is 2.9--4.8 times faster than standard training, and outperforms
	competing state-of-the-art baselines. Our code is available at
	scholar.harvard.edu/hadi/RbF/.},
  url       = {https://www.aclweb.org/anthology/D17-1255}
}

@InProceedings{gui-EtAl:2017:EMNLP20172,
  author    = {Gui, Tao  and  Zhang, Qi  and  Huang, Haoran  and  Peng, Minlong  and  Huang, Xuanjing},
  title     = {Part-of-Speech Tagging for {Twitter} with Adversarial Neural Networks},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2411--2420},
  abstract  = {In this work, we study the problem of part-of-speech tagging for Tweets. In
	contrast to newswire articles, Tweets are usually informal and contain numerous
	out-of-vocabulary words. Moreover, there is a lack of large scale labeled
	datasets for this domain. To tackle these challenges, we propose a novel neural
	network to make use of out-of-domain labeled data, unlabeled in-domain data,
	and labeled in-domain data.  Inspired by adversarial neural networks, the
	proposed method tries to learn common features through adversarial
	discriminator. In addition, we hypothesize that domain-specific features of
	target domain should be preserved in some degree. Hence, the proposed method
	adopts a sequence-to-sequence autoencoder to perform this task.  Experimental
	results on three different datasets  show that our method achieves better
	performance than state-of-the-art methods.},
  url       = {https://www.aclweb.org/anthology/D17-1256}
}

@InProceedings{li-EtAl:2017:EMNLP20176,
  author    = {Li, Bofang and Liu, Tao and Zhao, Zhe and Tang, Buzhou and Drozd, Aleksandr and Rogers, Anna and Du, Xiaoyong},
  title     = {Investigating Different Syntactic Context Types and Context Representations for Learning Word Embeddings},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2421--2431},
  abstract  = {The number of word embedding models is growing every year.
	Most of them are based on the co-occurrence information of words and their contexts.
	However, it is still an open question what is the best definition of context.
	We provide a systematical investigation of 4 different syntactic context types and context representations for learning word embeddings.
	Comprehensive experiments are conducted to evaluate their effectiveness on 6 extrinsic and intrinsic tasks.
	We hope that this paper, along with the published code, would be helpful for choosing the best context type and representation for a given task.},
  url       = {https://www.aclweb.org/anthology/D17-1257}
}

@InProceedings{braud-lacroix-sogaard:2017:EMNLP2017,
  author    = {Braud, Chlo{\'e}  and  Lacroix, Oph{\'e}lie  and  S{\o}gaard, Anders},
  title     = {Does syntax help discourse segmentation? {Not} so much},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2432--2442},
  abstract  = {Discourse segmentation is the first step in building discourse parsers. Most
	work on discourse segmentation does not scale to real-world discourse parsing
	across languages, for two reasons: (i) models rely on constituent trees, and
	(ii) experiments have relied on gold standard identification of sentence and
	token boundaries. We therefore investigate to what extent constituents can be
	replaced with universal dependencies, or left out completely, as well as how
	state-of-the-art segmenters fare in the absence of sentence boundaries. Our
	results show that dependency information is less useful than expected, but we
	provide a fully scalable, robust model that only relies on part-of-speech
	information, and show that it performs well across languages in the absence of
	any gold-standard annotation.},
  url       = {https://www.aclweb.org/anthology/D17-1258}
}

@InProceedings{lewis-EtAl:2017:EMNLP2017,
  author    = {Lewis, Mike  and  Yarats, Denis  and  Dauphin, Yann  and  Parikh, Devi  and  Batra, Dhruv},
  title     = {Deal or No Deal? {End}-to-End Learning of Negotiation Dialogues},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2443--2453},
  abstract  = {Much of human dialogue occurs in semi-cooperative settings, where agents with
	different goals attempt to agree on common decisions. Negotiations require
	complex communication and reasoning skills, but success is easy to measure,
	making this an interesting task for AI. We gather a large dataset of
	human-human negotiations on a multi-issue bargaining task, where agents who
	cannot observe each other's reward functions must reach an agreement (or a
	deal) via natural language dialogue. For the first time, we show it is possible
	to train end-to-end models for negotiation, which must learn both linguistic
	and reasoning skills with no annotated dialogue states. We also introduce
	dialogue rollouts, in which the model plans ahead by simulating possible
	complete continuations of the conversation, and find that this technique
	dramatically improves performance. Our code and dataset are publicly available.},
  url       = {https://www.aclweb.org/anthology/D17-1259}
}

@InProceedings{chen-EtAl:2017:EMNLP20172,
  author    = {Chen, Lu  and  Zhou, Xiang  and  Chang, Cheng  and  Yang, Runzhe  and  Yu, Kai},
  title     = {Agent-Aware Dropout {DQN} for Safe and Efficient On-line Dialogue Policy Learning},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2454--2464},
  abstract  = {Hand-crafted rules and reinforcement learning (RL) are two popular choices to
	obtain dialogue policy.  The rule-based policy is often reliable within
	predefined scope but not self-adaptable, whereas RL is evolvable with data but
	often suffers from a bad initial performance. We employ a {\em companion
	learning} framework to integrate the two approaches for {\em on-line} dialogue
	policy learning, in which a pre-defined rule-based policy acts as a
	``teacher'' and guides a data-driven RL system by giving example actions as
	well as additional rewards. A novel {\em agent-aware dropout} Deep Q-Network
	(AAD-DQN) is proposed to address the problem of when to consult the teacher and
	how to learn from the teacher's experiences. AAD-DQN, as a data-driven student
	policy, provides (1) two separate experience memories for student and teacher,
	(2) an uncertainty estimated by dropout to control the timing of consultation
	and learning. Simulation experiments showed that the proposed approach can
	significantly improve both {\em safety} and {\em efficiency} of on-line policy
	optimization compared to other companion learning approaches as well as
	supervised pre-training using static dialogue corpus.},
  url       = {https://www.aclweb.org/anthology/D17-1260}
}

@InProceedings{potash-rumshisky:2017:EMNLP2017,
  author    = {Potash, Peter and Rumshisky, Anna},
  title     = {Towards Debate Automation: a Recurrent Model for Predicting Debate Winners},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2465--2475},
  abstract  = {In this paper we introduce a practical first step towards the creation of an automated debate agent: a state-of-the-art recurrent predictive model for predicting debate winners.
	By having an accurate predictive model, we are able to objectively rate the quality of a statement made at a specific turn in a debate.
	The model is based on a recurrent neural network architecture with attention, which allows the model to effectively account for the entire debate when making its prediction.
	Our model achieves state-of-the-art accuracy on a dataset of debate transcripts annotated with audience favorability of the debate teams.
	Finally, we discuss how future work can leverage our proposed model for the creation of an automated debate agent.
	We accomplish this by determining the model input that will maximize audience favorability toward a given side of a debate at an arbitrary turn.},
  url       = {https://www.aclweb.org/anthology/D17-1261}
}

@InProceedings{ma-EtAl:2017:EMNLP2017,
  author    = {Ma, Qingsong  and  Graham, Yvette  and  Baldwin, Timothy  and  Liu, Qun},
  title     = {Further Investigation into Reference Bias in Monolingual Evaluation of Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2476--2485},
  abstract  = {Monolingual evaluation of Machine Translation (MT) aims to simplify human
	assessment by requiring assessors to compare the meaning of the MT output with
	a reference translation, opening up the task to a much larger pool of genuinely
	qualified evaluators. Monolingual evaluation runs the risk, however, of bias in
	favour of MT systems that happen to produce translations superficially similar
	to the reference and, consistent with this intuition, previous investigations
	have concluded monolingual assessment to be strongly biased in this respect. On
	re-examination of past analyses, we identify a series of potential analytical
	errors that force some important questions to be raised about the reliability
	of past conclusions, however. We subsequently carry out further investigation
	into reference bias via direct human assessment of MT adequacy via quality
	controlled crowd-sourcing. Contrary to both intuition and past conclusions,
	results show no significant evidence of reference bias in monolingual
	evaluation of MT.},
  url       = {https://www.aclweb.org/anthology/D17-1262}
}

@InProceedings{isabelle-cherry-foster:2017:EMNLP2017,
  author    = {Isabelle, Pierre and Cherry, Colin and Foster, George},
  title     = {A Challenge Set Approach to Evaluating Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2486--2496},
  abstract  = {Neural machine translation represents an exciting leap forward in translation quality.
	But what longstanding weaknesses does it resolve, and which remain?
	We address these questions with a challenge set approach to translation evaluation and error analysis.
	A challenge set consists of a small set of sentences, each hand-designed to probe a system's capacity to bridge a particular structural divergence between languages.
	To exemplify this approach, we present an English-French challenge set, and use it to analyze phrase-based and neural systems.
	The resulting analysis provides not only a more fine-grained picture of the strengths of neural systems, but also insight into which linguistic phenomena remain out of reach.},
  url       = {https://www.aclweb.org/anthology/D17-1263}
}

@InProceedings{nakashole-flauger:2017:EMNLP2017,
  author    = {Nakashole, Ndapandula  and  Flauger, Raphael},
  title     = {Knowledge Distillation for Bilingual Dictionary Induction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2497--2506},
  abstract  = {Leveraging zero-shot learning to learn
	mapping functions between vector spaces
	of different languages is a promising approach
	to bilingual dictionary induction.
	However, methods using this approach
	have not yet achieved high accuracy on the
	task. In this paper, we propose a bridging
	approach, where our main contribution
	is a knowledge distillation training objective.
	As teachers, rich resource translation
	paths are exploited in this role. And
	as learners, translation paths involving low
	resource languages learn from the teachers.
	Our training objective allows seamless
	addition of teacher translation paths
	for any given low resource pair. Since our
	approach relies on the quality of monolingual
	word embeddings, we also propose to
	enhance vector representations of both the
	source and target language with linguistic
	information. Our experiments on various
	languages show large performance gains
	from our distillation training objective, obtaining
	as high as 17\% accuracy improvements.},
  url       = {https://www.aclweb.org/anthology/D17-1264}
}

@InProceedings{bawden:2017:EMNLP2017,
  author    = {Bawden, Rachel},
  title     = {Machine Translation, it's a question of style, innit? {The} case of {English} tag questions},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2507--2512},
  abstract  = {In this paper, we address the problem of generating English tag questions (TQs)
	(e.g. it is, isn't it?) in Machine Translation (MT). We propose a
	post-edition solution, formulating the problem as a multi-class classification
	task. We present (i) the automatic annotation of English TQs in a parallel
	corpus of subtitles and (ii) an approach using a series of classifiers to
	predict TQ forms, which we use to post-edit state-of-the-art MT outputs. Our
	method provides significant improvements in English TQ translation when
	translating from Czech, French and German, in turn improving the fluidity,
	naturalness, grammatical correctness and pragmatic coherence of MT output.},
  url       = {https://www.aclweb.org/anthology/D17-1265}
}

@InProceedings{pourdamghani-knight:2017:EMNLP2017,
  author    = {Pourdamghani, Nima  and  Knight, Kevin},
  title     = {Deciphering Related Languages},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2513--2518},
  abstract  = {We present a method for translating texts between close language pairs. The
	method does not require parallel data, and it does not require the languages to
	be written in the same script. We show results for six language pairs:
	Afrikaans/Dutch, Bosnian/Serbian, Danish/Swedish, Macedonian/Bulgarian,
	Malaysian/Indonesian, and Polish/Belorussian. We report BLEU scores showing
	our method to outperform others that do not use parallel data.},
  url       = {https://www.aclweb.org/anthology/D17-1266}
}

@InProceedings{starnaud-beck-kondrak:2017:EMNLP2017,
  author    = {St Arnaud, Adam  and  Beck, David  and  Kondrak, Grzegorz},
  title     = {Identifying Cognate Sets Across Dictionaries of Related Languages},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2519--2528},
  abstract  = {We present a system for identifying cognate sets across dictionaries of related
	languages. The likelihood of a cognate relationship is calculated on the basis
	of a rich set of features that capture both phonetic and semantic similarity,
	as well as the presence of regular sound correspondences. The similarity scores
	are used to cluster words from different languages that may originate from a
	common proto-word. When tested on the Algonquian language family, our system
	detects 63\% of cognate sets while maintaining cluster purity of 70\%.},
  url       = {https://www.aclweb.org/anthology/D17-1267}
}

@InProceedings{malaviya-neubig-littell:2017:EMNLP2017,
  author    = {Malaviya, Chaitanya  and  Neubig, Graham  and  Littell, Patrick},
  title     = {Learning Language Representations for Typology Prediction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2529--2535},
  abstract  = {One central mystery of neural NLP is what neural models ``know'' about their
	subject matter. When a neural machine translation system learns to translate
	from one language to another, does it learn the syntax or semantics of the
	languages? Can this knowledge be extracted from the system to fill holes in
	human scientific knowledge? Existing typological databases contain relatively
	full feature specifications for only a few hundred languages. Exploiting the
	existence of parallel texts in more than a thousand languages, we build a
	massive many-to-one NMT system from 1017 languages into English, and use this
	to predict information missing from typological databases. Experiments show
	that the proposed method is able to infer not only syntactic, but also
	phonological and phonetic inventory features, and improves over a baseline that
	has access to information about the languages' geographic and phylogenetic
	neighbors.},
  url       = {https://www.aclweb.org/anthology/D17-1268}
}

@InProceedings{mayhew-tsai-roth:2017:EMNLP2017,
  author    = {Mayhew, Stephen  and  Tsai, Chen-Tse  and  Roth, Dan},
  title     = {Cheap Translation for Cross-Lingual Named Entity Recognition},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2536--2545},
  abstract  = {Recent work in NLP has attempted to deal with low-resource languages but still
	assumed a resource level that is not present for most languages, e.g., the
	availability of Wikipedia in the target language. We propose a simple method
	for cross-lingual named entity recognition (NER) that works well in settings
	with {\em very} minimal resources. Our approach makes use of a lexicon to
	``translate'' annotated data available in one or several high resource
	language(s) into the target language, and learns a standard monolingual NER
	model there. Further, when Wikipedia is available in the target language, our
	method can enhance Wikipedia based methods to yield state-of-the-art NER
	results; we evaluate on 7 diverse languages, improving the state-of-the-art by
	an average of 5.5\% F1 points. With the minimal resources required, this is an
	extremely portable cross-lingual NER approach, as illustrated using a truly
	low-resource language, Uyghur.},
  url       = {https://www.aclweb.org/anthology/D17-1269}
}

@InProceedings{vulic-mrkvsic-korhonen:2017:EMNLP2017,
  author    = {Vuli{\'c}, Ivan  and  Mrk{\v{s}}i{\'c}, Nikola  and  Korhonen, Anna},
  title     = {Cross-Lingual Induction and Transfer of Verb Classes Based on Word Vector Space Specialisation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2546--2558},
  abstract  = {Existing approaches to automatic VerbNet-style verb classification are heavily
	dependent on feature engineering and therefore limited to languages with mature
	NLP pipelines. In this work, we propose a novel cross-lingual transfer method
	for inducing VerbNets for multiple languages. To the best of our knowledge,
	this is the first study which demonstrates how the architectures for learning
	word embeddings can be applied to this challenging syntactic-semantic task. Our
	method uses cross-lingual translation pairs to tie each of the six target
	languages into a bilingual vector space with English, jointly specialising the
	representations to encode the relational information from English VerbNet. A
	standard clustering algorithm is then run on top of the VerbNet-specialised
	representations, using vector dimensions as features for learning verb classes.
	Our results show that the proposed cross-lingual transfer approach sets new
	state-of-the-art verb classification performance across all six target
	languages explored in this work.},
  url       = {https://www.aclweb.org/anthology/D17-1270}
}

@InProceedings{friedrich-gateva:2017:EMNLP2017,
  author    = {Friedrich, Annemarie  and  Gateva, Damyana},
  title     = {Classification of telicity using cross-linguistic annotation projection},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2559--2565},
  abstract  = {This paper addresses the automatic recognition of telicity, an aspectual
	notion. A telic event includes a natural endpoint (``she walked home''), while an
	atelic event does not (``she walked around''). Recognizing this difference is a
	prerequisite for temporal natural language understanding. In English, this
	classification task is difficult, as telicity is a covert linguistic category.
	In contrast, in Slavic languages, aspect is part of a verb's meaning and even
	available in machine-readable dictionaries. Our contributions are as follows.
	We successfully leverage additional silver standard training data in the form
	of projected annotations from parallel English-Czech data as well as context
	information, improving automatic telicity classification for English
	significantly compared to previous work. We also create a new data set of
	English texts manually annotated with telicity.},
  url       = {https://www.aclweb.org/anthology/D17-1271}
}

@InProceedings{lawrence-sokolov-riezler:2017:EMNLP2017,
  author    = {Lawrence, Carolin  and  Sokolov, Artem  and  Riezler, Stefan},
  title     = {Counterfactual Learning from Bandit Feedback under Deterministic Logging: A Case Study in Statistical Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2566--2576},
  abstract  = {The goal of counterfactual learning for statistical machine translation (SMT)
	is to optimize a target SMT system from logged data that consist of user
	feedback to translations that were predicted by another, historic SMT system. A
	challenge arises by the fact that risk-averse commercial SMT systems
	deterministically log the most probable translation. The lack of sufficient
	exploration of the SMT output space seemingly contradicts the theoretical
	requirements for counterfactual learning. We show that counterfactual learning
	from deterministic bandit logs is possible nevertheless by smoothing out
	deterministic components in learning. This can be achieved by additive and
	multiplicative control variates that avoid degenerate behavior in empirical
	risk minimization. Our simulation experiments show improvements of up to 2 BLEU
	points by counterfactual learning from deterministic bandit feedback.},
  url       = {https://www.aclweb.org/anthology/D17-1272}
}

@InProceedings{wang-EtAl:2017:EMNLP20177,
  author    = {Wang, Chengyu  and  Fan, Yan  and  He, Xiaofeng  and  Zhou, Aoying},
  title     = {Learning Fine-grained Relations from {Chinese} User Generated Categories},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2577--2587},
  abstract  = {User generated categories (UGCs) are short texts that reflect how people
	describe and organize entities, expressing rich semantic relations implicitly.
	While most methods on UGC relation extraction are based on pattern matching in
	English circumstances, learning relations from Chinese UGCs poses different
	challenges due to the flexibility of expressions. In this paper, we present a
	weakly supervised learning framework to harvest relations from Chinese UGCs. We
	identify is-a relations via word embedding based projection and inference,
	extract non-taxonomic relations and their category patterns by graph mining. We
	conduct experiments on Chinese Wikipedia and achieve high accuracy,
	outperforming state-of-the-art methods.},
  url       = {https://www.aclweb.org/anthology/D17-1273}
}

@InProceedings{huang-EtAl:2017:EMNLP20172,
  author    = {Huang, Lifu  and  Sil, Avirup  and  Ji, Heng  and  Florian, Radu},
  title     = {Improving Slot Filling Performance with Attentive Neural Networks on Dependency Structures},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2588--2597},
  abstract  = {Slot Filling (SF) aims to extract the values of certain types of attributes (or
	slots, such as person:cities\_of\_residence) for a given entity from a large
	collection of source documents.
	In this paper we propose an effective DNN architecture for SF with the
	following new strategies: (1). Take a regularized dependency graph instead of a
	raw sentence as input to DNN, to compress the wide contexts between query and
	candidate filler; (2). Incorporate two attention mechanisms: local attention
	learned from query and candidate filler, and global attention learned from
	external knowledge bases, to guide the model to better select indicative
	contexts to determine slot type. Experiments show that this framework
	outperforms state-of-the-art on both relation extraction (16\% absolute F-score
	gain) and slot filling validation for each individual system (up to 8.5\%
	absolute F-score gain).},
  url       = {https://www.aclweb.org/anthology/D17-1274}
}

@InProceedings{durrett-EtAl:2017:EMNLP2017,
  author    = {Durrett, Greg  and  Kummerfeld, Jonathan K.  and  Berg-Kirkpatrick, Taylor  and  Portnoff, Rebecca  and  Afroz, Sadia  and  McCoy, Damon  and  Levchenko, Kirill  and  Paxson, Vern},
  title     = {Identifying Products in Online Cybercrime Marketplaces: A Dataset for Fine-grained Domain Adaptation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2598--2607},
  abstract  = {One weakness of machine-learned NLP models is that they typically perform
	poorly on out-of-domain data. In this work, we study the task of identifying
	products being bought and sold in online cybercrime forums, which exhibits
	particularly challenging cross-domain effects. We formulate a task that
	represents a hybrid of slot-filling information extraction and named entity
	recognition and annotate data from four different forums. Each of these forums
	constitutes its own ``fine-grained domain'' in that the forums cover different
	market sectors with different properties, even though all forums are in the
	broad domain of cybercrime. We characterize these domain differences in the
	context of a learning-based system: supervised models see decreased accuracy
	when applied to new forums, and standard techniques for semi-supervised
	learning and domain adaptation have limited effectiveness on this data, which
	suggests the need to improve these techniques. We release a dataset of 1,938
	annotated posts from across the four forums.},
  url       = {https://www.aclweb.org/anthology/D17-1275}
}

@inproceedings{muis-lu:2017:EMNLP2017,
    title = {Labeling Gaps Between Words: Recognizing Overlapping Mentions with Mention Separators},
    author = {Muis, Aldrian Obaja and Lu, Wei},
    booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
    pages = {2608--2618},
    year = {2017},
    month = {September},
    publisher = {Association for Computational Linguistics},
    address = {Copenhagen, Denmark},
    url = {https://www.aclweb.org/anthology/D17-1276},
    abstract = {In this paper, we propose a new model that is capable of recognizing
        overlapping mentions. We introduce a novel notion of mention separators that
        can be effectively used to capture how mentions overlap with one another. On
        top of a novel multigraph representation that we introduce, we show that
        efficient and exact inference can still be performed. We present some
        theoretical analysis on the differences between our model and a recently
        proposed model for recognizing overlapping mentions, and discuss the possible
        implications of the differences. Through extensive empirical analysis on
        standard datasets, we demonstrate the effectiveness of our approach.},
}

@inproceedings{ganea-hofmann:2017:EMNLP2017,
    title = {Deep Joint Entity Disambiguation with Local Neural Attention},
    author = {Ganea, Octavian-Eugen and Hofmann, Thomas},
    booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
    pages = {2619--2629},
    year = {2017},
    month = {September},
    publisher = {Association for Computational Linguistics},
    address = {Copenhagen, Denmark},
    url = {https://www.aclweb.org/anthology/D17-1277},
    abstract = {We propose a novel deep learning model for joint document-level entity
        disambiguation, which leverages learned neural representations. Key components
        are entity embeddings, a neural attention mechanism over local context windows,
        and a differentiable joint inference stage for disambiguation. Our approach
        thereby combines benefits of deep learning with more traditional approaches
        such as graphical models and probabilistic mention-entity maps. Extensive
        experiments show that we are able to obtain competitive or state-of-the-art
        accuracy at moderate computational costs.},
}

@InProceedings{gashteovski-gemulla-delcorro:2017:EMNLP2017,
  author    = {Gashteovski, Kiril  and  Gemulla, Rainer  and  Del Corro, Luciano},
  title     = {{MinIE}: Minimizing Facts in Open Information Extraction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2630--2640},
  abstract  = {The goal of Open Information Extraction (OIE) is to extract surface relations
	and their arguments from natural-language text in an unsupervised,
	domain-independent manner. In this paper, we propose MinIE, an OIE system that
	aims to provide useful, compact extractions with high precision and
	recall. MinIE approaches these goals by (1) representing information about
	polarity, modality, attribution, and quantities with semantic annotations
	instead of in the actual extraction, and (2) identifying and removing parts
	that are considered overly specific. We conducted an experimental study with
	several real-world datasets and found that MinIE achieves competitive or
	higher precision and recall than most prior systems, while at the same time
	producing shorter, semantically enriched extractions.},
  url       = {https://www.aclweb.org/anthology/D17-1278}
}

@InProceedings{luan-ostendorf-hajishirzi:2017:EMNLP2017,
  author    = {Luan, Yi  and  Ostendorf, Mari  and  Hajishirzi, Hannaneh},
  title     = {Scientific Information Extraction with Semi-supervised Neural Tagging},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2641--2651},
  abstract  = {This paper addresses the problem of extracting keyphrases from scientific
	articles and categorizing them as corresponding to a task, process, or
	material. We cast the problem as sequence tagging and introduce
	semi-supervised methods to a neural tagging model, which builds on recent
	advances in named entity recognition. Since annotated training data is scarce
	in this domain, we introduce a graph-based semi-supervised algorithm together
	with a data selection scheme to leverage unannotated articles. Both inductive
	and transductive semi-supervised learning strategies outperform
	state-of-the-art information extraction performance on the 2017 SemEval Task 10
	ScienceIE task.},
  url       = {https://www.aclweb.org/anthology/D17-1279}
}

@InProceedings{tang-EtAl:2017:EMNLP2017,
  author    = {Tang, Siliang  and  Zhang, Ning  and  Zhang, Jinjiang  and  Wu, Fei  and  Zhuang, Yueting},
  title     = {{NITE}: A Neural Inductive Teaching Framework for Domain Specific {NER}},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2652--2657},
  abstract  = {In domain-specific NER, due to insufficient labeled training data, deep models
	usually fail to behave normally. In this paper, we proposed a novel Neural
	Inductive TEaching framework (NITE) to transfer knowledge from existing
	domain-specific NER models into an arbitrary deep neural network in a
	teacher-student training manner. NITE is a general framework that builds upon
	transfer learning and multiple instance learning, which collaboratively not
	only transfers knowledge to a deep student network but also reduces the noise
	from teachers. NITE can help deep learning methods to effectively utilize
	existing resources (i.e., models, labeled and unlabeled data) in a small
	domain. The experiment resulted on Disease NER proved that without using any
	labeled data, NITE can significantly boost the performance of a
	CNN-bidirectional LSTM-CRF NER neural network nearly over 30\% in terms of
	F1-score.},
  url       = {https://www.aclweb.org/anthology/D17-1280}
}

@inproceedings{sharma-parekh-talukdar:2017:EMNLP2017,
    title = {Speeding up Reinforcement Learning-based Information Extraction Training using Asynchronous Methods},
    author = {Sharma, Aditya and Parekh, Zarana and Talukdar, Partha},
    booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
    pages = {2658--2663},
    year = {2017},
    month = {September},
    publisher = {Association for Computational Linguistics},
    address = {Copenhagen, Denmark},
    url = {https://www.aclweb.org/anthology/D17-1281},
    abstract = {RLIE-DQN is a recently proposed Reinforcement Learning-based Information
        Extraction (IE) technique which is able to incorporate external evidence during
        the extraction process. RLIE-DQN trains a single agent sequentially, training
        on one instance at a time. This results in significant training slowdown which
        is undesirable. We leverage recent advances in parallel RL training using
        asynchronous methods and propose RLIE-A3C. RLIE-A3C trains multiple agents in
        parallel and is able to achieve upto 6x training speedup over RLIE-DQN, while
        suffering no loss in average accuracy.},
}

@inproceedings{li-EtAl:2017:EMNLP20177,
    title = {Leveraging Linguistic Structures for Named Entity Recognition with Bidirectional Recursive Neural Networks},
    author = {Li, Peng-Hsuan and Dong, Ruo-Ping and Wang, Yu-Siang and Chou, Ju-Chieh and Ma, Wei-Yun},
    booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
    pages = {2664--2669},
    year = {2017},
    month = {September},
    publisher = {Association for Computational Linguistics},
    address = {Copenhagen, Denmark},
    url = {https://www.aclweb.org/anthology/D17-1282},
    abstract = {In this paper, we utilize the linguistic structures of texts to improve named
        entity recognition by BRNN-CNN, a special bidirectional recursive network
        attached with a convolutional network. Motivated by the observation that named
        entities are highly related to linguistic constituents, we propose a
        constituent-based BRNN-CNN for named entity recognition. In contrast to
        classical sequential labeling methods, the system first identifies which text
        chunks are possible named entities by whether they are linguistic constituents.
        Then it classifies these chunks with a constituency tree structure by
        recursively propagating syntactic and semantic information to each constituent
        node. This method surpasses current state-of-the-art on OntoNotes 5.0 with
        automatically generated parses.},
}

@inproceedings{strubell-EtAl:2017:EMNLP2017,
    title = {Fast and Accurate Entity Recognition with Iterated Dilated Convolutions},
    author = {Strubell, Emma and Verga, Patrick and Belanger, David and McCallum, Andrew},
    booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
    pages = {2670--2680},
    year = {2017},
    month = {September},
    publisher = {Association for Computational Linguistics},
    address = {Copenhagen, Denmark},
    url = {https://www.aclweb.org/anthology/D17-1283},
    abstract = {Today when many practitioners run basic NLP on the entire web and large-volume
        traffic, faster methods are paramount to saving time and energy costs.
        Recent advances in GPU hardware have led to the emergence of bi-directional
        LSTMs as a standard method for obtaining per-token vector representations
        serving as input to labeling tasks such as NER (often followed by prediction in
        a linear-chain CRF).
        Though expressive and accurate, these models fail to fully exploit GPU
        parallelism, limiting their computational efficiency.
        This paper proposes a faster alternative to Bi-LSTMs for NER: Iterated Dilated
        Convolutional Neural Networks (ID-CNNs), which have better capacity than
        traditional CNNs for large context and structured prediction.
        Unlike LSTMs whose sequential processing on sentences of length N requires O(N)
        time even in the face of parallelism, ID-CNNs permit fixed-depth convolutions
        to run in parallel across entire documents.
        We describe a distinct combination of network structure, parameter sharing and
        training procedures that enable dramatic 14-20x test-time speedups while
        retaining accuracy comparable to the Bi-LSTM-CRF. Moreover, ID-CNNs trained to
        aggregate context from the entire document are more accurate than Bi-LSTM-CRFs
        while attaining 8x faster test time speeds.},
}

@InProceedings{gupta-singh-roth:2017:EMNLP2017,
  author    = {Gupta, Nitish  and  Singh, Sameer  and  Roth, Dan},
  title     = {Entity Linking via Joint Encoding of Types, Descriptions, and Context},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2681--2690},
  abstract  = {For accurate entity linking, we need to capture various information aspects of
	an entity, such as its description in a KB, contexts in which it is mentioned,
	and structured knowledge. Additionally, a linking system should work on texts
	from different domains without requiring domain-specific training data or
	hand-engineered features.
	In this work we present a neural, modular entity linking system that learns a
	unified dense representation for each entity using multiple sources of
	information, such as its description, contexts around its mentions, and its
	fine-grained types. We show that the resulting entity linking system is
	effective at combining these sources, and performs competitively, sometimes
	out-performing current state-of-the-art systems across datasets, without
	requiring any domain-specific training data or hand-engineered features. We
	also show that our model can effectively ``embed'' entities that are new to
	the KB, and is able to link its mentions accurately.},
  url       = {https://www.aclweb.org/anthology/D17-1284}
}

@InProceedings{he-EtAl:2017:EMNLP2017,
  author    = {He, Hua  and  Ganjam, Kris  and  Jain, Navendu  and  Lundin, Jessica  and  White, Ryen  and  Lin, Jimmy},
  title     = {An Insight Extraction System on {BioMedical} Literature with Deep Neural Networks},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2691--2701},
  abstract  = {Mining biomedical text offers an opportunity to automatically discover
	important facts and infer associations among them. As new scientific findings
	appear across a large collection of biomedical publications, our aim is to tap
	into this literature to automate biomedical knowledge extraction and identify
	important insights from them. Towards that goal, we develop a system with novel
	deep neural networks to extract insights on biomedical literature. Evaluation
	shows our system is able to provide insights with competitive accuracy of human
	acceptance and its relation extraction component outperforms previous work.},
  url       = {https://www.aclweb.org/anthology/D17-1285}
}

@inproceedings{nastase-strapparava:2017:EMNLP2017,
    title = {Word Etymology as Native Language Interference},
    author = {Nastase, Vivi and Strapparava, Carlo},
    booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
    pages = {2702--2707},
    year = {2017},
    month = {September},
    publisher = {Association for Computational Linguistics},
    address = {Copenhagen, Denmark},
    url = {https://www.aclweb.org/anthology/D17-1286},
    abstract = {We present experiments that show the influence of native language on lexical
        choice when producing text in another language -- in this particular case
        English. We start from the premise that non-native English speakers will choose
        lexical items that are close to words in their native language. This leads us
        to an etymology-based representation of documents written by people whose
        mother tongue is an Indo-European language. Based on this representation we
        grow a language family tree, that matches closely the Indo-European language
        tree.},
}

@InProceedings{eisenberg-finlayson:2017:EMNLP2017,
  author    = {Eisenberg, Joshua  and  Finlayson, Mark},
  title     = {A Simpler and More Generalizable Story Detector using Verb and Character Features},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2708--2715},
  abstract  = {Story detection is the task of determining whether or not a unit of text
	contains a story. Prior approaches achieved a maximum performance of 0.66 F1,
	and did not generalize well across different corpora. We present a new
	state-of-the-art detector that achieves a maximum performance of 0.75 F1 (a 14\%
	improvement), with significantly greater generalizability than previous work.
	In particular, our detector achieves performance above 0.70 F1 across a variety
	of combinations of lexically different corpora for training and testing, as
	well as dramatic improvements (up to 4,000\%) in performance when trained on a
	small, disfluent data set. The new detector uses two basic types of
	features---ones related to events, and ones related to characters---totaling
	283 specific features overall; previous detectors used tens of thousands of
	features, and so this detector represents a significant simplification along
	with increased performance.},
  url       = {https://www.aclweb.org/anthology/D17-1287}
}

@InProceedings{schulz-kuhn:2017:EMNLP2017,
  author    = {Schulz, Sarah  and  Kuhn, Jonas},
  title     = {Multi-modular domain-tailored {OCR} post-correction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2716--2726},
  abstract  = {One of the main obstacles for many Digital Humanities projects is the low data
	availability. Texts have to be digitized in an expensive and time consuming
	process whereas Optical Character Recognition (OCR) post-correction is one of
	the time-critical factors. At the example of OCR post-correction, we show the
	adaptation of a generic system to solve a specific problem with little data.
	The system accounts for a diversity of errors encountered in OCRed texts coming
	from different time periods in the domain of literature. We show that the
	combination of different approaches, such as e.g. Statistical Machine
	Translation and spell checking, with the help of a ranking mechanism
	tremendously improves over single-handed approaches. Since we consider the
	accessibility of the resulting tool as a crucial part of Digital Humanities
	collaborations, we describe the workflow we suggest for efficient text
	recognition and subsequent automatic and manual post-correction.},
  url       = {https://www.aclweb.org/anthology/D17-1288}
}

@inproceedings{luo-EtAl:2017:EMNLP2017,
    title = {Learning to Predict Charges for Criminal Cases with Legal Basis},
    author = {Luo, Bingfeng and Feng, Yansong and Xu, Jianbo and Zhang, Xiang and Zhao, Dongyan},
    booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
    pages = {2727--2736},
    year = {2017},
    month = {September},
    publisher = {Association for Computational Linguistics},
    address = {Copenhagen, Denmark},
    url = {https://www.aclweb.org/anthology/D17-1289},
    abstract = {The charge prediction task is to determine appropriate charges for a given
        case, which is helpful for legal assistant systems where the user input is fact
        description. We argue that relevant law articles play an important role in this
        task, and therefore propose an attention-based neural network method to jointly
        model the charge prediction task and the relevant article extraction task in a
        unified framework. The experimental results show that, besides providing legal
        basis, the relevant articles can also clearly improve the charge prediction
        results, and our full model can effectively predict appropriate charges for
        cases with different expression styles.},
}

@inproceedings{schofield-thompson-mimno:2017:EMNLP2017,
    title = {Quantifying the Effects of Text Duplication on Semantic Models},
    author = {Schofield, Alexandra and Thompson, Laure and Mimno, David},
    booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
    pages = {2737--2747},
    year = {2017},
    month = {September},
    publisher = {Association for Computational Linguistics},
    address = {Copenhagen, Denmark},
    url = {https://www.aclweb.org/anthology/D17-1290},
    abstract = {Duplicate documents are a pervasive problem in text datasets and can have a
        strong effect on unsupervised models. Methods to remove duplicate texts are
        typically heuristic or very expensive, so it is vital to know when and why they
        are needed. We measure the sensitivity of two latent semantic methods to the
        presence of different levels of document repetition. By artificially creating
        different forms of duplicate text we confirm several hypotheses about how
        repeated text impacts models. While a small amount of duplication is tolerable,
        substantial over-representation of subsets of the text may overwhelm meaningful
        topical patterns.},
}

@InProceedings{zhuang-EtAl:2017:EMNLP2017,
  author    = {Zhuang, Honglei  and  Wang, Chi  and  Tao, Fangbo  and  Kaplan, Lance  and  Han, Jiawei},
  title     = {Identifying Semantically Deviating Outlier Documents},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2748--2757},
  abstract  = {A document outlier is a document that substantially deviates in semantics from
	the majority ones in a corpus. Automatic identification of document outliers
	can be valuable in many applications, such as screening health records for
	medical mistakes. In this paper, we study the problem of mining semantically
	deviating document outliers in a given corpus. We develop a generative model
	to identify frequent and characteristic semantic regions in the word embedding
	space to represent the given corpus, and a robust outlierness measure which is
	resistant to noisy content in documents. Experiments conducted on two
	real-world textual data sets show that our method can achieve an up to 135\%
	improvement over baselines in terms of recall at top-1\% of the outlier ranking.},
  url       = {https://www.aclweb.org/anthology/D17-1291}
}

@inproceedings{kang-EtAl:2017:EMNLP2017,
    title = {Detecting and Explaining Causes From Text For a Time Series Event},
    author = {Kang, Dongyeop and Gangal, Varun and Lu, Ang and Chen, Zheng and Hovy, Eduard},
    booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
    pages = {2758--2767},
    year = {2017},
    month = {September},
    publisher = {Association for Computational Linguistics},
    address = {Copenhagen, Denmark},
    url = {https://www.aclweb.org/anthology/D17-1292},
    abstract = {Explaining underlying causes or effects about events is a challenging but
        valuable task.
        We define a novel problem of generating explanations of a time series event by
        (1) searching cause and effect relationships of the time series with textual
        data and (2) constructing a connecting chain between them to generate an
        explanation.
        To detect causal features from text, we propose a novel method based on the
        Granger causality of time series between features extracted from text such as
        N-grams, topics, sentiments, and their composition.
        The generation of the sequence of causal entities requires a commonsense
        causative knowledge base with efficient reasoning.
        To ensure good interpretability and appropriate lexical usage we combine
        symbolic and neural representations, using a neural reasoning algorithm trained
        on commonsense causal tuples to predict the next cause step.
        Our quantitative and human analysis show empirical evidence that our method
        successfully extracts meaningful causality relationships between time series
        with textual features and generates appropriate explanation between them.},
}

@InProceedings{jiang-EtAl:2017:EMNLP2017,
  author    = {Jiang, Zhuoxuan  and  Feng, Shanshan  and  Cong, Gao  and  Miao, Chunyan  and  Li, Xiaoming},
  title     = {A Novel Cascade Model for Learning Latent Similarity from Heterogeneous Sequential Data of {MOOC}},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2768--2773},
  abstract  = {Recent years have witnessed the proliferation of Massive Open Online Courses
	(MOOCs). With massive learners being offered MOOCs, there is a demand that the
	forum contents within MOOCs need to be classified in order to facilitate both
	learners and instructors. Therefore we investigate a significant application,
	which is to associate forum threads to subtitles of video clips. This task can
	be regarded as a document ranking problem, and the key is how to learn a
	distinguishable text representation from word sequences and learners' behavior
	sequences. In this paper, we propose a novel cascade model, which can capture
	both the latent semantics and latent similarity by modeling MOOC data.
	Experimental results on two real-world datasets demonstrate that our textual
	representation outperforms state-of-the-art unsupervised counterparts for the
	application.},
  url       = {https://www.aclweb.org/anthology/D17-1293}
}

@inproceedings{mysoresathyendra-EtAl:2017:EMNLP2017,
    title = {Identifying the Provision of Choices in Privacy Policy Text},
    author = {Mysore Sathyendra, Kanthashree and Wilson, Shomir and Schaub, Florian and Zimmeck, Sebastian and Sadeh, Norman},
    booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
    pages = {2774--2779},
    year = {2017},
    month = {September},
    publisher = {Association for Computational Linguistics},
    address = {Copenhagen, Denmark},
    url = {https://www.aclweb.org/anthology/D17-1294},
    abstract = {Websites' and mobile apps' privacy policies, written in natural language, tend
        to be long and difficult to understand. Information privacy revolves around the
        fundamental principle of Notice and choice, namely the idea that users should
        be able to make informed decisions about what information about them can be
        collected and how it can be used. Internet users want control over their
        privacy, but their choices are often hidden in long and convoluted privacy
        policy texts. Moreover, little (if any) prior work has been done to detect the
        provision of choices in text. We address this challenge of enabling user choice
        by automatically identifying and extracting pertinent choice language in
        privacy policies. In particular, we present a two-stage architecture of
        classification models to identify opt-out choices in privacy policy text,
        labelling common varieties of choices with a mean F1 score of 0.735. Our
        techniques enable the creation of systems to help Internet users to learn about
        their choices, thereby effectuating notice and choice and improving Internet
        privacy.},
}

@inproceedings{goyal-EtAl:2017:EMNLP2017,
    title = {An Empirical Analysis of Edit Importance between Document Versions},
    author = {Goyal, Tanya and Kelkar, Sachin and Agarwal, Manas and Grover, Jeenu},
    booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
    pages = {2780--2784},
    year = {2017},
    month = {September},
    publisher = {Association for Computational Linguistics},
    address = {Copenhagen, Denmark},
    url = {https://www.aclweb.org/anthology/D17-1295},
    abstract = {In this paper, we present a novel approach to infer significance of various
        textual  edits to documents. An author may make several edits to a document;
        each edit varies in its impact to the content of the document. While some edits
        are surface changes and introduce negligible change, other edits may change the
        content/tone of the document significantly. In this paper, we perform an
        analysis on the human perceptions of edit importance while reviewing documents
        from one version to the next. We identify linguistic features that influence
        edit importance and model it in a regression based setting. We show that the
        predicted importance by our approach is highly correlated with the human
        perceived importance, established by a Mechanical Turk study.},
}

@InProceedings{wang-EtAl:2017:EMNLP20178,
  author    = {Wang, Shaolei  and  Che, Wanxiang  and  Zhang, Yue  and  Zhang, Meishan  and  Liu, Ting},
  title     = {Transition-Based Disfluency Detection using {LSTMs}},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2785--2794},
  abstract  = {In this paper, we model the problem of disfluency detection using a
	transition-based framework, which incrementally constructs and labels the
	disfluency chunk of input sentences using a new transition system without
	syntax information. Compared with sequence labeling methods, it can capture
	non-local chunk-level features; compared with joint parsing and disfluency
	detection methods, it is free for noise in syntax. Experiments show that our
	model achieves state-of-the-art f-score of 87.5\% on the commonly used English
	Switchboard test set, and a set of in-house annotated Chinese data.},
  url       = {https://www.aclweb.org/anthology/D17-1296}
}

@InProceedings{yannakoudakis-EtAl:2017:EMNLP2017,
  author    = {Yannakoudakis, Helen  and  Rei, Marek  and  Andersen, {\O}istein E.  and  Yuan, Zheng},
  title     = {Neural Sequence-Labelling Models for Grammatical Error Correction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2795--2806},
  abstract  = {We propose an approach to N-best list re-ranking using neural
	sequence-labelling models. We train a compositional model for error detection
	that calculates the probability of each token in a sentence being correct or
	incorrect, utilising the full sentence as context. Using the error detection
	model, we then re-rank the N best hypotheses generated by statistical machine
	translation systems. Our approach achieves state-of-the-art results on error
	correction for three different datasets, and it has the additional advantage
	of only using a small set of easily computed features that require no
	linguistic input.},
  url       = {https://www.aclweb.org/anthology/D17-1297}
}

@InProceedings{schmaltz-EtAl:2017:EMNLP2017,
  author    = {Schmaltz, Allen  and  Kim, Yoon  and  Rush, Alexander  and  Shieber, Stuart},
  title     = {Adapting Sequence Models for Sentence Correction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2807--2813},
  abstract  = {In a controlled experiment of sequence-to-sequence approaches for the task of
	sentence correction, we find that character-based models are generally more
	effective than word-based models and models that encode subword information via
	convolutions, and that modeling the output data as a series of diffs improves
	effectiveness over standard approaches. Our strongest sequence-to-sequence
	model improves over our strongest phrase-based statistical machine translation
	model, with access to the same data, by 6 M2 (0.5 GLEU) points. Additionally,
	in the data environment of the standard CoNLL-2014 setup, we demonstrate that
	modeling (and tuning against) diffs yields similar or better M2 scores with
	simpler models and/or significantly less data than previous
	sequence-to-sequence approaches.},
  url       = {https://www.aclweb.org/anthology/D17-1298}
}

@InProceedings{niu-martindale-carpuat:2017:EMNLP2017,
  author    = {Niu, Xing and Martindale, Marianna and Carpuat, Marine},
  title     = {A Study of Style in Machine Translation: Controlling the Formality of Machine Translation Output},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2814--2819},
  url       = {https://www.aclweb.org/anthology/D17-1299},
  abstract  = {Stylistic variations of language, such as formality, carry
    speakers' intention beyond literal meaning and should be conveyed
    adequately in translation. We propose to use lexical formality models
    to control the formality level of machine translation output. We
    demonstrate the effectiveness of our approach in empirical
    evaluations, as measured by automatic metrics and human assessments.}
}

@InProceedings{devlin:2017:EMNLP2017,
  author    = {Devlin, Jacob},
  title     = {Sharp Models on Dull Hardware: Fast and Accurate Neural Machine Translation Decoding on the {CPU}},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2820--2825},
  abstract  = {Attentional sequence-to-sequence models have become the new standard for
	machine translation, but one challenge of such models is a significant increase
	in training and decoding cost compared to phrase-based systems. In this work we
	focus on efficient decoding, with a goal of achieving accuracy close the
	state-of-the-art in neural machine translation (NMT), while achieving CPU
	decoding speed/throughput close to that of a phrasal decoder.
	We approach this problem from two angles: First, we describe several techniques
	for speeding up an NMT beam search decoder, which obtain a 4.4x speedup over a
	very efficient baseline decoder without changing the decoder output. Second, we
	propose a simple but powerful network architecture which uses an RNN (GRU/LSTM)
	layer at bottom, followed by a series of stacked fully-connected layers applied
	at every timestep. This architecture achieves similar accuracy to a deep
	recurrent model, at a small fraction of the training and decoding cost. By
	combining these techniques, our best system achieves a very competitive
	accuracy of 38.3 BLEU on WMT English-French NewsTest2014, while decoding at 100
	words/sec on single-threaded CPU. We believe this is the best published
	accuracy/speed trade-off of an NMT system.},
  url       = {https://www.aclweb.org/anthology/D17-1300}
}

@InProceedings{wang-EtAl:2017:EMNLP20179,
  author    = {Wang, Longyue and Tu, Zhaopeng and Way, Andy and Liu, Qun},
  title     = {Exploiting Cross-Sentence Context for Neural Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2826--2831},
  url       = {https://www.aclweb.org/anthology/D17-1301},
  abstract  = {In translation, considering the document as a whole can help
    to resolve ambiguities and inconsistencies. In this paper, we propose
    a cross-sentence context-aware approach and investigate the influence
    of historical contextual information on the performance of neural
    machine translation (NMT). First, this history is summarized in a
    hierarchical way. We then integrate the historical representation
    into NMT in two strategies: 1) a warm-start of encoder and decoder
    states, and 2) an auxiliary context source for updating decoder
    states. Experimental results on a large Chinese-English translation
    task show that our approach significantly improves upon a strong
    attention-based NMT system by up to +2.1 BLEU points.}
}

@InProceedings{kim-EtAl:2017:EMNLP2017,
  author    = {Kim, Joo-Kyung  and  Kim, Young-Bum  and  Sarikaya, Ruhi  and  Fosler-Lussier, Eric},
  title     = {Cross-Lingual Transfer Learning for {POS} Tagging without Cross-Lingual Resources},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2832--2838},
  abstract  = {Training a POS tagging model with crosslingual transfer learning usually
	requires linguistic knowledge and resources about the relation between the
	source language and the target language. In this paper, we introduce a
	cross-lingual transfer learning model for POS tagging without ancillary
	resources such as parallel corpora. The proposed cross-lingual model utilizes a
	common BLSTM that enables knowledge transfer from other languages, and private
	BLSTMs for language-specific representations. The cross-lingual model is
	trained with language-adversarial training and bidirectional language modeling
	as auxiliary objectives to better represent language-general information while
	not losing the information about a specific target language. Evaluating on POS
	datasets from 14 languages in the Universal Dependencies corpus, we show that
	the proposed transfer learning model improves the POS tagging performance of
	the target languages without exploiting any linguistic knowledge between the
	source language and the target language.},
  url       = {https://www.aclweb.org/anthology/D17-1302}
}

@InProceedings{gella-EtAl:2017:EMNLP2017,
  author    = {Gella, Spandana and Sennrich, Rico and Keller, Frank and Lapata, Mirella},
  title     = {Image Pivoting for Learning Multilingual Multimodal Representations},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2839--2845},
  url       = {https://www.aclweb.org/anthology/D17-1303},
  abstract  = {In this paper we propose a model to learn multimodal
    multilingual representations for matching images and sentences in
    different languages, with the aim of advancing multilingual versions
    of image search and image understanding. Our model learns a common
    representation for images and their descriptions in two different
    languages (which need not be parallel) by considering the image as a
    pivot between two languages. We introduce a new pairwise ranking loss
    function which can handle both symmetric and asymmetric similarity
    between the two modalities. We evaluate our models on
    image-description ranking for German and English, and on semantic
    textual similarity of image descriptions in English. In both cases we
    achieve state-of-the-art performance.}
}

@InProceedings{chen-EtAl:2017:EMNLP20173,
  author    = {Chen, Kehai and Wang, Rui and Utiyama, Masao and Liu, Lemao and Tamura, Akihiro and Sumita, Eiichiro and Zhao, Tiejun},
  title     = {Neural Machine Translation with Source Dependency Representation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2846--2852},
  url       = {https://www.aclweb.org/anthology/D17-1304},
  abstract  = {Source dependency information has been successfully
    introduced into statistical machine translation. However, there are
    only a few preliminary attempts for Neural Machine Translation (NMT),
    such as concatenating representations of source word and its
    dependency label together. In this paper, we propose a novel NMT with
    source dependency representation to improve translation performance
    of NMT, especially long sentences. Empirical results on NIST
    Chinese-to-English translation task show that our method achieves 1.6
    BLEU improvements on average over a strong NMT system.}
}

@InProceedings{han-martinezgomez-mineshima:2017:EMNLP2017,
  author    = {Han, Dan and Mart\'{i}nez-G\'{o}mez, Pascual and Mineshima, Koji},
  title     = {Visual Denotations for Recognizing Textual Entailment},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2853--2859},
  url       = {https://www.aclweb.org/anthology/D17-1305},
  abstract  = {In the logic approach to Recognizing Textual Entailment,
    identifying phrase-to-phrase semantic relations is still an unsolved
    problem. Resources such as the Paraphrase Database offer limited
    coverage despite their large size whereas unsupervised distributional
    models of meaning often fail to recognize phrasal entailments. We
    propose to map phrases to their visual denotations and compare their
    meaning in terms of their images. We show that our approach is
    effective in the task of Recognizing Textual Entailment when combined
    with specific linguistic and logic features.}
}

@InProceedings{mathur-baldwin-cohn:2017:EMNLP2017,
  author    = {Mathur, Nitika and Baldwin, Timothy and Cohn, Trevor},
  title     = {Sequence Effects in Crowdsourced Annotations},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2860--2865},
  url       = {https://www.aclweb.org/anthology/D17-1306},
  abstract  = {Manual data annotation is a vital component of NLP research.
    When designing annotation tasks, properties of the annotation
    interface can unintentionally lead to artefacts in the resulting
    dataset, biasing the evaluation. In this paper, we explore sequence
    effects where annotations of an item are affected by the preceding
    items. Having assigned one label to an instance, the annotator may be
    less (or more) likely to assign the same label to the next. During
    rating tasks, seeing a low quality item may affect the score given to
    the next item either positively or negatively. We see clear evidence
    of both types of effects using auto-correlation studies over three
    different crowdsourced datasets. We then recommend a simple way to
    minimise sequence effects.}
}

@InProceedings{ture-jojic:2017:EMNLP2017,
  author    = {Ture, Ferhan and Jojic, Oliver},
  title     = {No Need to Pay Attention: Simple Recurrent Neural Networks Work!},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2866--2872},
  url       = {https://www.aclweb.org/anthology/D17-1307},
  abstract  = {First-order factoid question answering assumes that the
    question can be answered by a single fact in a knowledge base (KB).
    While this does not seem like a challenging task, many recent
    attempts that apply either complex linguistic reasoning or deep
    neural networks achieve 65\%--76\% accuracy on benchmark sets. Our
    approach formulates the task as two machine learning problems:\ detecting
    the entities in the question, and classifying the question as one of
    the relation types in the KB. We train a recurrent neural network to
    solve each problem. On the SimpleQuestions dataset, our approach
    yields substantial improvements over previously published results ---
    even neural networks based on much more complex architectures. The
    simplicity of our approach also has practical advantages, such as
    efficiency and modularity, that are valuable especially in an
    industry setting. In fact, we present a preliminary analysis of the
    performance of our model on real queries from Comcast's X1
    entertainment platform with millions of users every day.}
}

@InProceedings{mimno-thompson:2017:EMNLP2017,
  author    = {Mimno, David and Thompson, Laure},
  title     = {The strange geometry of skip-gram with negative sampling},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2873--2878},
  url       = {https://www.aclweb.org/anthology/D17-1308},
  abstract  = {Despite their ubiquity, word embeddings trained with
    skip-gram negative sampling (SGNS) remain poorly understood. We find
    that vector positions are not simply determined by semantic
    similarity, but rather occupy a narrow cone, diametrically opposed to
    the context vectors. We show that this geometric concentration
    depends on the ratio of positive to negative examples, and that it is
    neither theoretically nor empirically inherent in related embedding
    algorithms.}
}

@InProceedings{botha-EtAl:2017:EMNLP2017,
  author    = {Botha, Jan A. and Pitler, Emily and Ma, Ji and Bakalov, Anton and Salcianu, Alex and Weiss, David and McDonald, Ryan and Petrov, Slav},
  title     = {Natural Language Processing with Small Feed-Forward Networks},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2879--2885},
  url       = {https://www.aclweb.org/anthology/D17-1309},
  abstract  = {We show that small and shallow feed-forward neural networks
    can achieve near state-of-the-art results on a range of unstructured
    and structured language processing tasks while being considerably
    cheaper in memory and computational requirements than deep recurrent
    models. Motivated by resource-constrained environments like mobile
    phones, we showcase simple techniques for obtaining such small neural
    network models, and investigate different tradeoffs when deciding how
    to allocate a small memory budget.}
}

@InProceedings{li-lam:2017:EMNLP2017,
  author    = {Li, Xin and Lam, Wai},
  title     = {Deep Multi-Task Learning for Aspect Term Extraction with Memory Interaction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2886--2892},
  url       = {https://www.aclweb.org/anthology/D17-1310},
  abstract  = {We propose a novel LSTM-based deep multi-task learning
    framework for aspect term extraction from user review sentences. Two
    LSTMs equipped with extended memories and neural memory operations
    are designed for jointly handling the extraction tasks of aspects and
    opinions via memory interactions. Sentimental sentence constraint is
    also added for more accurate prediction via another LSTM. Experiment
    results over two benchmark datasets demonstrate the effectiveness of
    our framework.}
}

@InProceedings{andreas-klein:2017:EMNLP2017,
  author    = {Andreas, Jacob and Klein, Dan},
  title     = {Analogs of Linguistic Structure in Deep Representations},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2893--2897},
  url       = {https://www.aclweb.org/anthology/D17-1311},
  abstract  = {We investigate the compositional structure of message vectors
    computed by a deep network trained on a communication game. By
    comparing truth-conditional representations of encoder-produced
    message vectors to human-produced referring expressions, we are able
    to identify aligned (vector, utterance) pairs with the same meaning.
    We then search for structured relationships among these aligned pairs
    to discover simple vector space transformations corresponding to
    negation, conjunction, and disjunction. Our results suggest that
    neural representations are capable of spontaneously developing a
    ``syntax'' with functional analogues to qualitative properties of
    natural language.}
}

@InProceedings{yang-lu-zheng:2017:EMNLP2017,
  author    = {Yang, Wei and Lu, Wei and Zheng, Vincent},
  title     = {A Simple Regularization-based Algorithm for Learning Cross-Domain Word Embeddings},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2898--2904},
  url       = {https://www.aclweb.org/anthology/D17-1312},
  abstract  = {Learning word embeddings has received a significant amount of
    attention recently. Often, word embeddings are learned in an
    unsupervised manner from a large collection of text. The genre of the
    text typically plays an important role in the effectiveness of the
    resulting embeddings. How to effectively train word embedding models
    using data from different domains remains a problem that is less
    explored. In this paper, we present a simple yet effective method for
    learning word embeddings based on text from different domains. We
    demonstrate the effectiveness of our approach through extensive
    experiments on various down-stream NLP tasks.}
}

@InProceedings{noriegaatala-EtAl:2017:EMNLP2017,
  author    = {Noriega-Atala, Enrique  and  Valenzuela-Esc\'{a}rcega, Marco A.  and  Morrison, Clayton  and  Surdeanu, Mihai},
  title     = {Learning what to read: Focused machine reading},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2905--2910},
  abstract  = {Recent efforts in bioinformatics have achieved tremendous progress in the
	machine reading of biomedical literature, and the assembly of the extracted
	biochemical interactions into large-scale models such as protein signaling
	pathways. However, batch machine reading of literature at today's scale
	(PubMed alone indexes over 1 million papers per year) is unfeasible due to
	both cost and processing overhead. In this work, we introduce a focused
	reading approach to guide the machine reading of biomedical literature towards
	what literature should be read to answer a biomedical query as efficiently as
	possible. We introduce a family of algorithms for focused reading, including
	an intuitive, strong baseline, and a second approach which uses a reinforcement
	learning (RL) framework that learns when to explore (widen the search) or
	exploit (narrow it). We demonstrate that the RL approach is capable of
	answering more queries than the baseline, while being more efficient, i.e.,
	reading fewer documents.},
  url       = {https://www.aclweb.org/anthology/D17-1313}
}

@InProceedings{shu-xu-liu:2017:EMNLP2017,
  author    = {Shu, Lei  and  Xu, Hu  and  Liu, Bing},
  title     = {{DOC}: Deep Open Classification of Text Documents},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2911--2916},
  abstract  = {Traditional supervised learning makes the closed-world assumption that the
	classes appeared in the test data must have appeared in training. This also
	applies to text learning or text classification. As learning is used
	increasingly in dynamic open environments where some new/test documents may not
	belong to any of the training classes, identifying these novel documents during
	classification presents an important problem. This problem is called open-world
	classification or open classification. This paper proposes a novel deep
	learning based approach. It outperforms existing state-of-the-art techniques
	dramatically.},
  url       = {https://www.aclweb.org/anthology/D17-1314}
}

@InProceedings{gangal-EtAl:2017:EMNLP2017,
  author    = {Gangal, Varun and Jhamtani, Harsh and Neubig, Graham and Hovy, Eduard and Nyberg, Eric},
  title     = {Charmanteau: Character Embedding Models For Portmanteau Creation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2917--2922},
  url       = {https://www.aclweb.org/anthology/D17-1315},
  abstract  = {Portmanteaus are a word formation phenomenon where two words
    combine into a new word. We propose character-level neural
    sequence-to-sequence (S2S) methods for the task of portmanteau
    generation that are end-to-end-trainable, language independent, and
    do not explicitly use additional phonetic information. We propose a
    noisy-channel-style model, which allows for the incorporation of
    unsupervised word lists, improving performance over a standard
    source-to-target model. This model is made possible by an exhaustive
    candidate generation strategy specifically enabled by the features of
    the portmanteau task. Experiments find our approach superior to a
    state-of-the-art FST-based baseline with respect to ground truth
    accuracy and human evaluation.}
}

@InProceedings{gutierrez-EtAl:2017:EMNLP2017,
  author    = {Gutierrez, E. Dario  and  Cecchi, Guillermo  and  Corcoran, Cheryl  and  Corlett, Philip},
  title     = {Using Automated Metaphor Identification to Aid in Detection and Prediction of First-Episode Schizophrenia},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2923--2930},
  abstract  = {The diagnosis of serious mental health conditions such as schizophrenia is
	based on the judgment of clinicians whose training takes several years, and
	cannot be easily formalized into objective measures. However, previous research
	suggests there are disturbances in aspects of the language use of patients with
	schizophrenia.
	Using metaphor-identification and sentiment-analysis algorithms to
	automatically generate features, we create a classifier, that, with high
	accuracy, can predict which patients will develop (or currently suffer from)
	schizophrenia.
	To our knowledge, this study is the first to demonstrate
	the utility of automated metaphor identification algorithms for detection or
	prediction of disease.},
  url       = {https://www.aclweb.org/anthology/D17-1316}
}

@InProceedings{rashkin-EtAl:2017:EMNLP2017,
  author    = {Rashkin, Hannah and Choi, Eunsol and Jang, Jin Yea and Volkova, Svitlana and Choi, Yejin},
  title     = {Truth of Varying Shades: Analyzing Language in Fake News and Political Fact-Checking},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2931--2937},
  url       = {https://www.aclweb.org/anthology/D17-1317},
  abstract  = {We present an analytic study on the language of news media in
    the context of political fact-checking and fake news detection. We
    compare the language of real news with that of satire, hoaxes, and
    propaganda to find linguistic characteristics of untrustworthy text.
    To probe the feasibility of automatic political fact-checking, we
    also present a case study based on PolitiFact.com using their
    factuality judgments on a 6-point scale. Experiments show that while
    media fact-checking remains to be an open research question,
    stylistic cues can help determine the truthfulness of text.}
}

@InProceedings{menini-EtAl:2017:EMNLP2017,
  author    = {Menini, Stefano  and  Nanni, Federico  and  Ponzetto, Simone Paolo  and  Tonelli, Sara},
  title     = {Topic-Based Agreement and Disagreement in {US} Electoral Manifestos},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2938--2944},
  abstract  = {We present a topic-based analysis of agreement and disagreement in political
	manifestos, which relies on a new method for topic detection based on key
	concept clustering. Our approach outperforms both standard techniques like LDA
	and a state-of-the-art graph-based method, and provides promising initial
	results for this new task in computational social science.},
  url       = {https://www.aclweb.org/anthology/D17-1318}
}

@InProceedings{xu-koehn:2017:EMNLP2017,
  author    = {Xu, Hainan and Koehn, Philipp},
  title     = {Zipporah: a Fast and Scalable Data Cleaning System for Noisy Web-Crawled Parallel Corpora},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2945--2950},
  url       = {https://www.aclweb.org/anthology/D17-1319},
  abstract  = {We introduce Zipporah, a fast and scalable data cleaning
    system. We propose a novel type of bag-of-words translation feature,
    and train logistic regression models to classify good data and
    synthetic noisy data in the proposed feature space. The trained model
    is used to score parallel sentences in the data pool for selection.
    As shown in experiments, Zipporah selects a high-quality parallel
    corpus from a large, mixed quality data pool. In particular, for one
    noisy dataset, Zipporah achieves a 2.1 BLEU score improvement with
    using 1/5 of the data over using the entire corpus.}
}

@InProceedings{falke-gurevych:2017:EMNLP2017,
  author    = {Falke, Tobias and Gurevych, Iryna},
  title     = {Bringing Structure into Summaries: Crowdsourcing a Benchmark Corpus of Concept Maps},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2951--2961},
  url       = {https://www.aclweb.org/anthology/D17-1320},
  abstract  = {Concept maps can be used to concisely represent important
    information and bring structure into large document collections.
    Therefore, we study a variant of multi-document summarization that
    produces summaries in the form of concept maps. However, suitable
    evaluation datasets for this task are currently missing. To close
    this gap, we present a newly created corpus of concept maps that
    summarize heterogeneous collections of web documents on educational
    topics. It was created using a novel crowdsourcing approach that
    allows us to efficiently determine important elements in large
    document collections. We release the corpus along with a baseline
    system and proposed evaluation protocol to enable further research on
    this variant of summarization.}
}

@InProceedings{kottur-EtAl:2017:EMNLP2017,
  author    = {Kottur, Satwik  and  Moura, Jos\'{e}  and  Lee, Stefan  and  Batra, Dhruv},
  title     = {Natural Language Does Not Emerge `Naturally' in Multi-Agent Dialog},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2962--2967},
  abstract  = {A number of recent works have proposed techniques for end-to-end learning of
	communication protocols among cooperative multi-agent populations, and have
	simultaneously found the emergence of grounded human-interpretable language in
	the protocols developed by the agents, learned without any human supervision!
	In this paper, using a Task \& Talk reference game between two agents as a
	testbed, we present a sequence of `negative' results culminating in a
	`positive' one -- showing that while most agent-invented languages are
	effective (i.e. achieve near-perfect task rewards), they are decidedly not
	interpretable or compositional. In essence, we find that natural language does
	not emerge `naturally', despite the semblance of ease of
	natural-language-emergence that one may gather from recent literature. We
	discuss how it is possible to coax the invented languages to become more and
	more human-like and compositional by increasing restrictions on how two agents
	may communicate.},
  url       = {https://www.aclweb.org/anthology/D17-1321}
}

@InProceedings{yates-cohan-goharian:2017:EMNLP2017,
  author    = {Yates, Andrew and Cohan, Arman and Goharian, Nazli},
  title     = {Depression and Self-Harm Risk Assessment in Online Forums},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  month     = {September},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2968--2978},
  url       = {https://www.aclweb.org/anthology/D17-1322},
  abstract  = {Users suffering from mental health conditions often turn to
    online resources for support, including specialized online support
    communities or general communities such as Twitter and Reddit. In
    this work, we present a framework for supporting and studying users
    in both types of communities. We propose methods for identifying
    posts in support communities that may indicate a risk of self-harm,
    and demonstrate that our approach outperforms strong previously
    proposed methods for identifying such posts. Self-harm is closely
    related to depression, which makes identifying depressed users on
    general forums a crucial related task. We introduce a large-scale
    general forum dataset consisting of users with self-reported
    depression diagnoses matched with control users. We show how our
    method can be applied to effectively identify depressed users from
    their use of language alone. We demonstrate that our method
    outperforms strong baselines on this general forum dataset.}
}

@InProceedings{zhao-EtAl:2017:EMNLP20173,
  author    = {Zhao, Jieyu  and  Wang, Tianlu  and  Yatskar, Mark  and  Ordonez, Vicente  and  Chang, Kai-Wei},
  title     = {Men Also Like Shopping: Reducing Gender Bias Amplification using Corpus-level Constraints},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2979--2989},
  abstract  = {Language is increasingly being used to define rich visual recognition problems
	with supporting image collections sourced from the web. Structured prediction
	models are used in these tasks to take advantage of correlations between
	co-occurring labels and visual input but risk inadvertently encoding
	social biases found in web corpora. In this work, we study data and models
	associated with multilabel object classification and visual semantic role
	labeling. We find that (a) datasets for these tasks contain significant gender
	bias and (b) models trained on these datasets further amplify existing
	bias. For example, the activity cooking is over 33\% more likely to involve
	females than males in a training set, and a trained model further
	amplifies the disparity to 68\% at test time. We propose to inject corpus-level
	constraints for calibrating existing structured prediction models and design an
	algorithm based on Lagrangian relaxation for collective inference. Our method
	results in almost no performance loss for the underlying recognition task but
	decreases the magnitude of bias amplification by 47.5\% and 40.5\% for multilabel
	classification and visual semantic role labeling, respectively.},
  url       = {https://www.aclweb.org/anthology/D17-1323}
}

