<volume id='Q16'>
  <paper id='1000'>
    <title>Transactions of the Association for Computational Linguistics – Volume 4, Issue 1</title>
  </paper>

  <paper id='1001'>
    <title>Morpho-syntactic Lexicon Generation Using Graph-based Semi-supervised Learning</title>
    <author><first>Manaal</first><last>Faruqui</last></author>
    <author><first>Ryan</first><last>McDonald</last></author>
    <author><first>Radu</first><last>Soricut</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/730/166</href>
    <pages>1--16</pages>
    <url>http://www.aclweb.org/anthology/Q16-1001</url>
  </paper>

  <paper id='1002'>
    <title>Learning to Understand Phrases by Embedding the Dictionary</title>
    <author><first>Felix</first><last>Hill</last></author>
    <author><first>KyungHyun</first><last>Cho</last></author>
    <author><first>Anna</first><last>Korhonen</last></author>
    <author><first>Yoshua</first><last>Bengio</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/711/168</href>
    <pages>17--30</pages>
    <url>http://www.aclweb.org/anthology/Q16-1002</url>
  </paper>

  <paper id='1003'>
    <title>A Bayesian Model of Diachronic Meaning Change</title>
    <author><first>Lea</first><last>Frermann</last></author>
    <author><first>Mirella</first><last>Lapata</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/796/169</href>
    <pages>31--45</pages>
    <url>http://www.aclweb.org/anthology/Q16-1003</url>
  </paper>

  <paper id='1004'>
    <title>Detecting Cross-cultural Differences Using a Multilingual Topic Model</title>
    <author><first>E. D.</first><last>Gutiérrez</last></author>
    <author><first>Ekaterina</first><last>Shutova</last></author>
    <author><first>Patricia</first><last>Lichtenstein</last></author>
    <author><first>Gerard</first><last>de Melo</last></author>
    <author><first>Luca</first><last>Gilardi</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/755/171</href>
    <pages>47--60</pages>
    <url>http://www.aclweb.org/anthology/Q16-1004</url>
  </paper>

  <paper id='1005'>
    <title>An Empirical Analysis of Formality in Online Communication</title>
    <author><first>Ellie</first><last>Pavlick</last></author>
    <author><first>Joel</first><last>Tetreault</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/732/173</href>
    <pages>61--74</pages>
    <url>http://www.aclweb.org/anthology/Q16-1005</url>
  </paper>

  <paper id='1006'>
    <title>Decoding Anagrammed Texts Written in an Unknown Language and Script</title>
    <author><first>Bradley</first><last>Hauer</last></author>
    <author><first>Grzegorz</first><last>Kondrak</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/821/174</href>
    <pages>75--86</pages>
    <url>http://www.aclweb.org/anthology/Q16-1006</url>
  </paper>

  <paper id='1007'>
    <title>Learning Tier-based Strictly 2-Local Languages</title>
    <author><first>Adam</first><last>Jardine</last></author>
    <author><first>Jeffrey</first><last>Heinz</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/694/175</href>
    <pages>87--98</pages>
    <url>http://www.aclweb.org/anthology/Q16-1007</url>
  </paper>

  <paper id='1008'>
    <title>Adapting to All Domains at Once: Rewarding Domain Invariance in SMT</title>
    <author><first>Hoang</first><last>Cuong</last></author>
    <author><first>Khalil</first><last>Sima'an</last></author>
    <author><first>Ivan</first><last>Titov</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/768/176</href>
    <pages>99--112</pages>
    <url>http://www.aclweb.org/anthology/Q16-1008</url>
  </paper>

  <paper id='1009'>
    <title>A Joint Model for Answer Sentence Ranking and Answer Extraction</title>
    <author><first>Md Arafat</first><last>Sultan</last></author>
    <author><first>Vittorio</first><last>Castelli</last></author>
    <author><first>Radu</first><last>Florian</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/738/177</href>
    <pages>113--125</pages>
    <url>http://www.aclweb.org/anthology/Q16-1009</url>
  </paper>

  <paper id='1010'>
    <title>Transforming Dependency Structures to Logical Forms for Semantic Parsing</title>
    <author><first>Siva</first><last>Reddy</last></author>
    <author><first>Oscar</first><last>Täckström</last></author>
    <author><first>Michael</first><last>Collins</last></author>
    <author><first>Tom</first><last>Kwiatkowski</last></author>
    <author><first>Dipanjan</first><last>Das</last></author>
    <author><first>Mark</first><last>Steedman</last></author>
    <author><first>Mirella</first><last>Lapata</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/807/178</href>
    <pages>127--140</pages>
    <url>http://www.aclweb.org/anthology/Q16-1010</url>
  </paper>

  <paper id='1011'>
    <title>Concept Grounding to Multiple Knowledge Bases via Indirect Supervision</title>
    <author><first>Chen-Tse</first><last>Tsai</last></author>
    <author><first>Dan</first><last>Roth</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/608/181</href>
    <pages>141--154</pages>
    <url>http://www.aclweb.org/anthology/Q16-1011</url>
  </paper>

  <paper id='1012'>
    <title>Learning to Make Inferences in a Semantic Parsing Task</title>
    <author><first>Kyle</first><last>Richardson</last></author>
    <author><first>Jonas</first><last>Kuhn</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/733/183</href>
    <pages>155--168</pages>
    <url>http://www.aclweb.org/anthology/Q16-1012</url>
  </paper>

  <paper id='1013'>
    <title>Reassessing the Goals of Grammatical Error Correction: Fluency Instead of Grammaticality</title>
    <author><first>Keisuke</first><last>Sakaguchi</last></author>
    <author><first>Courtney</first><last>Napoles</last></author>
    <author><first>Matt</first><last>Post</last></author>
    <author><first>Joel</first><last>Tetreault</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/800/185</href>
    <pages>169--182</pages>
    <url>http://www.aclweb.org/anthology/Q16-1013</url>
  </paper>

  <paper id='1014'>
    <title>Efficient Structured Inference for Transition-Based Parsing with Neural Networks and Error States</title>
    <author><first>Ashish</first><last>Vaswani</last></author>
    <author><first>Kenji</first><last>Sagae</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/659/186</href>
    <pages>183--196</pages>
    <url>http://www.aclweb.org/anthology/Q16-1014</url>
  </paper>

  <paper id='1015'>
    <title>Generating Training Data for Semantic Role Labeling based on Label Transfer from Linked Lexical Resources</title>
    <author><first>Silvana</first><last>Hartmann</last></author>
    <author><first>Judith</first><last>Eckle-Kohler</last></author>
    <author><first>Iryna</first><last>Gurevych</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/717/187</href>
    <pages>197--213</pages>
    <url>http://www.aclweb.org/anthology/Q16-1015</url>
  </paper>
  
  <paper id='1016'>
    <title>J-NERD: Joint Named Entity Recognition and Disambiguation with Rich Linguistic Features</title>
    <author><first>Dat</first><last>Nguyen</last></author>
    <author><first>Martin</first><last>Theobald</last></author>
    <author><first>Gerhard</first><last>Weikum</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/698/189</href>
    <pages>215--229</pages>
    <url>http://www.aclweb.org/anthology/Q16-1016</url>
  </paper>

  <paper id='1017'>
    <title>Discrete-State Variational Autoencoders for Joint Discovery and Factorization of Relations</title>
    <author><first>Diego</first><last>Marcheggiani</last></author>
    <author><first>Ivan</first><last>Titov</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/761/190</href>
    <pages>231--244</pages>
    <url>http://www.aclweb.org/anthology/Q16-1017</url>
  </paper>

  <paper id='1018'>
    <title>Unsupervised Part-Of-Speech Tagging with Anchor Hidden Markov Models</title>
    <author><first>Karl</first><last>Stratos</last></author>
    <author><first>Michael</first><last>Collins</last></author>
    <author><first>Daniel</first><last>Hsu</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/837/192</href>
    <pages>245--257</pages>
    <url>http://www.aclweb.org/anthology/Q16-1018</url>
  </paper>

  <paper id='1019'>
    <title>ABCNN: Attention-Based Convolutional Neural Network for Modeling Sentence Pairs</title>
    <author><first>Wenpeng</first><last>Yin</last></author>
    <author><first>Hinrich</first><last>Schütze</last></author>
    <author><first>Bing</first><last>Xiang</last></author>
    <author><first>Bowen</first><last>Zhou</last></author>
    <year>2016</year>
    <href>https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/download/831/194</href>
    <pages>259--272</pages>
    <url>http://www.aclweb.org/anthology/Q16-1019</url>
  </paper>

  <paper id='1020'>
    <title>Word Embeddings as Metric Recovery in Semantic Spaces</title>
    <author><first>Tatsunori</first><last>Hashimoto</last></author>
    <author><first>David</first><last>Alvarez-Melis</last></author>
    <author><first>Tommi</first><last>Jaakkola</last></author>
    <year>2016</year>
    <abstract>Continuous word representations have been remarkably useful across NLP tasks but remain poorly understood. We ground word embeddings in semantic spaces studied in the cognitive-psychometric literature, taking these spaces as the primary objects to recover. To this end, we relate log co-occurrences of words in large corpora to semantic similarity assessments and show that co-occurrences are indeed consistent with an Euclidean semantic space hypothesis. Framing word embedding as metric recovery of a semantic space unifies existing word embedding algorithms, ties them to manifold learning, and demonstrates that existing algorithms are consistent metric recovery methods given co-occurrence counts from random walks. Furthermore, we propose a simple, principled, direct metric recovery algorithm that performs on par with the state-of-the-art word embedding and manifold learning methods. Finally, we complement recent focus on analogies by constructing two new inductive reasoning datasets---series completion and classification---and demonstrate that word embeddings can be used to solve them as well.</abstract>
    <href>https://transacl.org/ojs/index.php/tacl/article/download/809/195</href>
    <pages>273--286</pages>
    <url>http://www.aclweb.org/anthology/Q16-1020</url>
  </paper>
  
  <paper id='1021'>
    <title>Comparing Apples to Apple: The Effects of Stemmers on Topic Models</title>
    <author><first>Alexandra</first><last>Schofield</last></author>
    <author><first>David</first><last>Mimno</last></author>
    <year>2016</year>
    <abstract>Rule-based stemmers such as the Porter stemmer are frequently used to preprocess English corpora for topic modeling. In this work, we train and evaluate topic models on a variety of corpora using several different stemming algorithms. We examine several different quantitative measures of the resulting models, including likelihood, coherence, model stability, and entropy. Despite their frequent use in topic modeling, we find that stemmers produce no meaningful improvement in likelihood and coherence and in fact can degrade topic stability.</abstract>
    <href>https://transacl.org/ojs/index.php/tacl/article/download/868/196</href>
    <pages>287--300</pages>
    <url>http://www.aclweb.org/anthology/Q16-1021</url>
  </paper>
  
  <paper id='1022'>
    <title>Multilingual Projection for Parsing Truly Low-Resource Languages</title>
    <author><first>Željko</first><last>Agić</last></author>
    <author><first>Anders</first><last>Johannsen</last></author>
    <author><first>Barbara</first><last>Plank</last></author>
    <author><first>Héctor</first><last>Martínez Alonso</last></author>
    <author><first>Natalie</first><last>Schluter</last></author>
    <author><first>Anders</first><last>Søgaard</last></author>
    <year>2016</year>
    <abstract>We propose a novel approach to cross-lingual part-of-speech tagging and dependency parsing for truly low-resource languages. Our annotation projection-based approach yields tagging and parsing models for over 100 languages. All that is needed are freely available parallel texts, and taggers and parsers for resource-rich languages. The empirical evaluation across 30 test languages shows that our method consistently provides top-level accuracies, close to established upper bounds, and outperforms several competitive baselines.</abstract>
    <href>https://transacl.org/ojs/index.php/tacl/article/download/869/197</href>
    <pages>301--312</pages>
    <url>http://www.aclweb.org/anthology/Q16-1022</url>
  </paper>
  
  <paper id='1023'>
    <title>Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations</title>
    <author><first>Eliyahu</first><last>Kiperwasser</last></author>
    <author><first>Yoav</first><last>Goldberg</last></author>
    <year>2016</year>
    <abstract>We present a simple and effective scheme for dependency parsing which is based on bidirectional-LSTMs (BiLSTMs). Each sentence token is associated with a BiLSTM vector representing the token in its sentential context, and feature vectors are constructed by concatenating a few BiLSTM vectors. The BiLSTM is trained jointly with the parser objective, resulting in very effective feature extractors for parsing. We demonstrate the effectiveness of the approach by applying it to a greedy transition-based parser as well as to a globally optimized graph-based parser. The resulting parsers have very simple architectures, and match or surpass the state-of-the-art accuracies on English and Chinese.</abstract>
    <href>https://transacl.org/ojs/index.php/tacl/article/download/885/198</href>
    <pages>313--327</pages>
    <url>http://www.aclweb.org/anthology/Q16-1023</url>
  </paper>
  
  <paper id='1024'>
    <title>Sparse Non-negative Matrix Language Modeling</title>
    <author><first>Joris</first><last>Pelemans</last></author>
    <author><first>Noam</first><last>Shazeer</last></author>
    <author><first>Ciprian</first><last>Chelba</last></author>
    <year>2016</year>
    <abstract>We present Sparse Non-negative Matrix (SNM) estimation, a novel probability estimation technique for language modeling that can efficiently incorporate arbitrary features. We evaluate SNM language models on two corpora: the One Billion Word Benchmark and a subset of the LDC English Gigaword corpus. Results show that SNM language models trained with n-gram features are a close match for the well-established Kneser-Ney models. The addition of skip-gram features yields a model that is in the same league as the state-of-the-art recurrent neural network language models, as well as complementary: combining the two modeling techniques yields the best known result on the One Billion Word Benchmark. On the Gigaword corpus further improvements are observed using features that cross sentence boundaries. The computational advantages of SNM estimation over both maximum entropy and neural network estimation are probably its main strength, promising an approach that has large flexibility in combining arbitrary features and yet scales gracefully to large amounts of data.</abstract>
    <href>https://transacl.org/ojs/index.php/tacl/article/download/561/199</href>
    <pages>329--342</pages>
    <url>http://www.aclweb.org/anthology/Q16-1024</url>
  </paper>
  
  <paper id='1025'>
    <title>Multi-lingual Dependency Parsing Evaluation: a Large-scale Analysis of Word Order Properties using Artificial Data</title>
    <author><first>Kristina</first><last>Gulordava</last></author>
    <author><first>Paola</first><last>Merlo</last></author>
    <year>2016</year>
    <abstract>The growing work in multi-lingual parsing faces the challenge of fair comparative evaluation and performance analysis across languages and their treebanks. The difficulty lies in teasing apart the properties of treebanks, such as their size or average sentence length, from those of the annotation scheme, and from the linguistic properties of languages. We propose a method to evaluate the effects of word order of a language on dependency parsing performance, while controlling for confounding treebank properties. The method uses artificially-generated treebanks that are minimal permutations of actual treebanks with respect to two word order properties: word order variation and dependency lengths. Based on these artificial data on twelve languages, we show that longer dependencies and higher word order variability degrade parsing performance. Our method also extends to minimal pairs of individual sentences, leading to a finer-grained understanding of parsing errors.</abstract>
    <href>https://transacl.org/ojs/index.php/tacl/article/download/870/201</href>
    <pages>343--356</pages>
    <url>http://www.aclweb.org/anthology/Q16-1025</url>
  </paper>
  
  <paper id='1026'>
    <title>Named Entity Recognition with Bidirectional LSTM-CNNs</title>
    <author><first>Jason</first><last>Chiu</last></author>
    <author><first>Eric</first><last>Nichols</last></author>
    <year>2016</year>
    <abstract>Named entity recognition is a challenging task that has traditionally required large amounts of knowledge in the form of feature engineering and lexicons to achieve high performance. In this paper, we present a novel neural network architecture that automatically detects word- and character-level features using a hybrid bidirectional LSTM and CNN architecture, eliminating the need for most feature engineering. We also propose a novel method of encoding partial lexicon matches in neural networks and compare it to existing approaches. Extensive evaluation shows that, given only tokenized text and publicly available word embeddings, our system is competitive on the CoNLL-2003 dataset and surpasses the previously reported state of the art performance on the OntoNotes 5.0 dataset by 2.13 F1 points. By using two lexicons constructed from publicly-available sources, we establish new state of the art performance with an F1 score of 91.62 on CoNLL-2003 and 86.28 on OntoNotes, surpassing systems that employ heavy feature engineering, proprietary lexicons, and rich entity linking information.</abstract>
    <href>https://transacl.org/ojs/index.php/tacl/article/download/792/202</href>
    <pages>357--370</pages>
    <url>http://www.aclweb.org/anthology/Q16-1026</url>
  </paper>
  
  <paper id='1027'>
    <title>Deep Recurrent Models with Fast-Forward Connections for Neural Machine Translation</title>
    <author><first>Jie</first><last>Zhou</last></author>
    <author><first>Ying</first><last>Cao</last></author>
    <author><first>Xuguang</first><last>Wang</last></author>
    <author><first>Peng</first><last>Li</last></author>
    <author><first>Wei</first><last>Xu</last></author>
    <year>2016</year>
    <abstract>Neural machine translation (NMT) aims at solving machine translation (MT) problems using neural networks and has exhibited promising results in recent years. However, most of the existing NMT models are shallow and there is still a performance gap between a single NMT model and the best conventional MT system. In this work, we introduce a new type of linear connections, named fast-forward connections, based on deep Long Short-Term Memory (LSTM) networks, and an interleaved bi-directional architecture for stacking the LSTM layers. Fast-forward connections play an essential role in propagating the gradients and building a deep topology of depth 16. On the WMT'14 English-to-French task, we achieve BLEU=37.7 with a single attention model, which outperforms the corresponding single shallow model by 6.2 BLEU points. This is the first time that a single NMT model achieves state-of-the-art performance and outperforms the best conventional model by 0.7 BLEU points. We can still achieve BLEU=36.3 even without using an attention mechanism. After special handling of unknown words and model ensembling, we obtain the best score reported to date on this task with BLEU=40.4. Our models are also validated on the more difficult WMT'14 English-to-German task.</abstract>
    <href>https://transacl.org/ojs/index.php/tacl/article/download/863/203</href>
    <pages>371--383</pages>
    <url>http://www.aclweb.org/anthology/Q16-1027</url>
  </paper>

  <paper id='1028'>
    <title>A Latent Variable Model Approach to PMI-based Word Embeddings</title>
    <author><first>Sanjeev</first><last>Arora</last></author>
    <author><first>Yuanzhi</first><last>Li</last></author>
    <author><first>Yingyu</first><last>Liang</last></author>
    <author><first>Tengyu</first><last>Ma</last></author>
    <author><first>Andrej</first><last>Risteski</last></author>
    <year>2016</year>
    <abstract>Semantic word embeddings represent the meaning of a word via a vector, and are created by diverse methods. Many use nonlinear operations on co-occurrence statistics, and have hand-tuned hyperparameters and reweighting methods. This paper proposes a new generative model, a dynamic version of the log-linear topic model of Mnih and Hinton (2007). The methodological novelty is to use the prior to compute closed form expressions for word statistics. This provides a theoretical justification for nonlinear models like PMI, word2vec, and GloVe, as well as some hyperparameter choices. It also helps explain why low-dimensional semantic embeddings contain linear algebraic structure that allows solution of word analogies, as shown by Mikolov et al. (2013a) and many subsequent papers. Experimental support is provided for the generative model assumptions, the most important of which is that latent word vectors are fairly uniformly dispersed in space.</abstract>
    <pages>385--399</pages>
    <href>https://www.transacl.org/ojs/index.php/tacl/article/download/742/204</href>
    <url>http://www.aclweb.org/anthology/Q16-1028</url>
  </paper>
  
  <paper id='1029'>
    <title>Optimizing Statistical Machine Translation for Text Simplification</title>
    <author><first>Wei</first><last>Xu</last></author>
    <author><first>Courtney</first><last>Napoles</last></author>
    <author><first>Ellie</first><last>Pavlick</last></author>
    <author><first>Quanze</first><last>Chen</last></author>
    <author><first>Chris</first><last>Callison-Burch</last></author>
    <year>2016</year>
    <abstract>Most recent sentence simplification systems use basic machine translation models to learn lexical and syntactic paraphrases from a manually simplified parallel corpus. These methods are limited by the quality and quantity of manually simplified corpora, which are expensive to build. In this paper, we conduct an in-depth adaptation of statistical machine translation to perform text simplification, taking advantage of large-scale paraphrases learned from bilingual texts and a small amount of manual simplifications with multiple references. Our work is the first to design automatic metrics that are effective for tuning and evaluating simplification systems, which will facilitate iterative development for this task.</abstract>
    <pages>401--415</pages>
    <href>https://www.transacl.org/ojs/index.php/tacl/article/download/741/205</href>
    <url>http://www.aclweb.org/anthology/Q16-1029</url>
  </paper>
  
  <paper id='1030'>
    <title>Encoding Prior Knowledge with Eigenword Embeddings</title>
    <author><first>Dominique</first><last>Osborne</last></author>
    <author><first>Shashi</first><last>Narayan</last></author>
    <author><first>Shay</first><last>Cohen</last></author>
    <year>2016</year>
    <abstract>Canonical correlation analysis (CCA) is a method for reducing the dimension of data represented using two views. It has been previously used to derive word embeddings, where one view indicates a word, and the other view indicates its context. We describe a way to incorporate prior knowledge into CCA, give a theoretical justification for it, and test it by deriving word embeddings and evaluating them on a myriad of datasets.</abstract>
    <pages>417--430</pages>
    <href>https://www.transacl.org/ojs/index.php/tacl/article/download/895/206</href>
    <url>http://www.aclweb.org/anthology/Q16-1030</url>
  </paper>
  
  <paper id='1031'>
    <title>Many Languages, One Parser</title>
    <author><first>Waleed</first><last>Ammar</last></author>
    <author><first>George</first><last>Mulcaire</last></author>
    <author><first>Miguel</first><last>Ballesteros</last></author>
    <author><first>Chris</first><last>Dyer</last></author>
    <author><first>Noah</first><last>Smith</last></author>
    <year>2016</year>
    <abstract>We train one multilingual model for dependency parsing and use it to parse sentences in several languages. The parsing model uses (i) multilingual word clusters and embeddings; (ii) token-level language information; and (iii) language-specific features (fine-grained POS tags). This input representation enables the parser not only to parse effectively in multiple languages, but also to generalize across languages based on linguistic universals and typological similarities, making it more effective to learn from limited annotations. Our parser's performance compares favorably to strong baselines in a range of data scenarios, including when the target language has a large treebank, a small treebank, or no treebank for training.</abstract>
    <pages>431--444</pages>
    <href>https://www.transacl.org/ojs/index.php/tacl/article/download/892/207</href>
    <url>http://www.aclweb.org/anthology/Q16-1031</url>
  </paper>
  
  <paper id='1032'>
    <title>Easy-First Dependency Parsing with Hierarchical Tree LSTMs</title>
    <author><first>Eliyahu</first><last>Kiperwasser</last></author>
    <author><first>Yoav</first><last>Goldberg</last></author>
    <year>2016</year>
    <abstract>We suggest a compositional vector representation of parse trees that relies on a recursive combination of recurrent-neural network encoders. To demonstrate its effectiveness, we use the representation as the backbone of a greedy, bottom-up dependency parser, achieving very strong accuracies for English and Chinese, without relying on external word embeddings. The parser’s implementation is available for download at the first author’s webpage.</abstract>
    <pages>445--461</pages>
    <href>https://www.transacl.org/ojs/index.php/tacl/article/download/798/208</href>
    <url>http://www.aclweb.org/anthology/Q16-1032</url>
  </paper>
  
  <paper id='1033'>
    <title>Large-scale Analysis of Counseling Conversations: An Application of Natural Language Processing to Mental Health</title>
    <author><first>Tim</first><last>Althoff</last></author>
    <author><first>Kevin</first><last>Clark</last></author>
    <author><first>Jure</first><last>Leskovec</last></author>
    <year>2016</year>
    <abstract>Mental illness is one of the most pressing public health issues of our time. While counseling and psychotherapy can be effective treatments, our knowledge about how to conduct successful counseling conversations has been limited due to lack of large-scale data with labeled outcomes of the conversations. In this paper, we present a large-scale, quantitative study on the discourse of text-message-based counseling conversations. We develop a set of novel computational discourse analysis methods to measure how various linguistic aspects of conversations are correlated with conversation outcomes. Applying techniques such as sequence-based conversation models, language model comparisons, message clustering, and psycholinguistics-inspired word frequency analyses, we discover actionable conversation strategies that are associated with better conversation outcomes.</abstract>
    <pages>463--476</pages>
    <href>https://www.transacl.org/ojs/index.php/tacl/article/download/802/209</href>
    <url>http://www.aclweb.org/anthology/Q16-1033</url>
  </paper>

  <paper id='1034'>
    <title>Fast, Small and Exact: Infinite-order Language Modelling with Compressed Suffix Trees</title>
    <author><first>Ehsan</first><last>Shareghi</last></author>
    <author><first>Matthias</first><last>Petri</last></author>
    <author><first>Gholamreza</first><last>Haffari</last></author>
    <author><first>Trevor</first><last>Cohn</last></author>
    <year>2016</year>
    <abstract>Efficient methods for storing and querying are critical for scaling high-order m-gram language models to large corpora. We propose a language model based on compressed suffix trees, a representation that is highly compact and can be easily held in memory, while supporting queries needed in computing language model probabilities on-the-fly. We present several optimizations which improve query runtimes up to 2500x, despite only incurring a modest increase in construction time and memory usage. For large corpora and high Markov orders, our method is highly competitive with the state-of-the-art KenLM package. It imposes much lower memory requirements, often by orders of magnitude, and has runtimes that are either similar (for training) or comparable (for querying).</abstract>
    <pages>477--490</pages>
    <href>https://www.transacl.org/ojs/index.php/tacl/article/download/865/211</href>
    <url>http://www.aclweb.org/anthology/Q16-1034</url>
  </paper>

  <paper id='1035'>
    <title>The Galactic Dependencies Treebanks: Getting More Data by Synthesizing New Languages</title>
    <author><first>Dingquan</first><last>Wang</last></author>
    <author><first>Jason</first><last>Eisner</last></author>
    <year>2016</year>
    <abstract>We release Galactic Dependencies 1.0—a large set of synthetic languages not found on Earth, but annotated in Universal Dependencies format. This new resource aims to provide training and development data for NLP methods that aim to adapt to unfamiliar languages. Each synthetic treebank is produced from a real treebank by stochastically permuting the dependents of nouns and/or verbs to match the word order of other real languages. We discuss the usefulness, realism, parsability, perplexity, and diversity of the synthetic languages. As a simple demonstration of the use of Galactic Dependencies, we consider single-source transfer, which attempts to parse a real target language using a parser trained on a “nearby” source language. We find that including synthetic source languages somewhat increases the diversity of the source pool, which significantly improves results for most target languages.</abstract>
    <pages>491--505</pages>
    <href>https://www.transacl.org/ojs/index.php/tacl/article/download/917/212</href>
    <url>http://www.aclweb.org/anthology/Q16-1035</url>
  </paper>

  <paper id='1036'>
    <title>Minimally Supervised Number Normalization</title>
    <author><first>Kyle</first><last>Gorman</last></author>
    <author><first>Richard</first><last>Sproat</last></author>
    <year>2016</year>
    <abstract>We propose two models for verbalizing numbers, a key component in speech recognition and synthesis systems. The first model uses an end-to-end recurrent neural network. The second model, drawing inspiration from the linguistics literature, uses finite-state transducers constructed with a minimal amount of training data. While both models achieve near-perfect performance, the latter model can be trained using several orders of magnitude less data than the former, making it particularly useful for low-resource languages.</abstract>
    <pages>507--519</pages>
    <href>https://www.transacl.org/ojs/index.php/tacl/article/download/897/213</href>
    <url>http://www.aclweb.org/anthology/Q16-1036</url>
  </paper>

  <paper id='1037'>
    <title>Assessing the Ability of LSTMs to Learn Syntax-Sensitive Dependencies</title>
    <author><first>Tal</first><last>Linzen</last></author>
    <author><first>Emmanuel</first><last>Dupoux</last></author>
    <author><first>Yoav</first><last>Goldberg</last></author>
    <year>2016</year>
    <abstract>The success of long short-term memory (LSTM) neural networks in language processing is typically attributed to their ability to capture long-distance statistical regularities. Linguistic regularities are often sensitive to syntactic structure; can such dependencies be captured by LSTMs, which do not have explicit structural representations? We begin addressing this question using number agreement in English subject-verb dependencies. We probe the architecture's grammatical competence both using training objectives with an explicit grammatical target (number prediction, grammaticality judgments) and using language models. In the strongly supervised settings, the LSTM achieved very high overall accuracy (less than 1% errors), but errors increased when sequential and structural information conflicted. The frequency of such errors rose sharply in the language-modeling setting. We conclude that LSTMs can capture a non-trivial amount of grammatical structure given targeted supervision, but stronger architectures may be required to further reduce errors; furthermore, the language modeling signal is insufficient for capturing syntax-sensitive dependencies, and should be supplemented with more direct supervision if such dependencies need to be captured.</abstract>
    <pages>521--535</pages>
    <href>https://www.transacl.org/ojs/index.php/tacl/article/download/972/215</href>
    <url>http://www.aclweb.org/anthology/Q16-1037</url>
  </paper>

  <paper id='1038'>
    <title>Understanding Satirical Articles Using Common-Sense</title>
    <author><first>Dan</first><last>Goldwasser</last></author>
    <author><first>Xiao</first><last>Zhang</last></author>
    <year>2016</year>
    <abstract>Automatic satire detection is a subtle text classification task, for machines and at times, even for humans. In this paper we argue that satire detection should be approached using common-sense inferences, rather than traditional word classification methods. We present a highly structured latent variable model capturing the required inferences. The model abstracts over the specific entities appearing in the articles, grouping them into generalized categories thus allowing the model to adapt to previously unseen situations.</abstract>
    <pages>537--549</pages>
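    <!-- NOTE(review): this href stops at "download/" and lacks the trailing numeric path segments that the other entries in this volume carry (e.g. ".../download/897/213"); the link is incomplete. TODO: restore the full path. -->
    <href>https://www.transacl.org/ojs/index.php/tacl/article/download/</href>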
    <url>http://www.aclweb.org/anthology/Q16-1038</url>
  </paper>

  <paper id='1039'>
    <title>Utilizing Temporal Information for Taxonomy Construction</title>
    <author><first>Anh Tuan</first><last>Luu</last></author>
    <author><first>Siu Cheung</first><last>Hui</last></author>
    <author><first>See Kiong</first><last>Ng</last></author>
    <year>2016</year>
    <abstract>Taxonomies play an important role in many applications by organizing domain knowledge into a hierarchy of ‘is-a’ relations between terms. Previous work on automatic construction of taxonomies from text documents either ignored temporal information or used fixed time periods to discretize the time series of documents. In this paper, we propose a time-aware method to automatically construct and effectively maintain a taxonomy from a given series of documents pre-clustered for a domain of interest. The method extracts temporal information from the documents and uses a timestamp contribution function to score the temporal relevance of the evidence from source texts when identifying the taxonomic relations for constructing the taxonomy. Experimental results show that our proposed method outperforms the state-of-the-art methods by increasing F-measure up to 7%-20%. Furthermore, the proposed method can incrementally update the taxonomy by adding fresh relations from new data and removing outdated relations using an information decay function. It thus avoids rebuilding the whole taxonomy from scratch for every update and keeps the taxonomy effectively up-to-date in order to track the latest information trends in the rapidly evolving domain.</abstract>
    <pages>551--564</pages>
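    <!-- NOTE(review): this href stops at "download/" and lacks the trailing numeric path segments that the other entries in this volume carry (e.g. ".../download/972/215"); the link is incomplete. TODO: restore the full path. -->
    <href>https://www.transacl.org/ojs/index.php/tacl/article/download/</href>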
    <url>http://www.aclweb.org/anthology/Q16-1039</url>
  </paper>
  
</volume>
  