<?xml version="1.0" encoding="UTF-8" ?>
<volume id="W17">
  <paper id="5000">
    <title>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</title>
    <editor>Joel Tetreault</editor>
    <editor>Jill Burstein</editor>
    <editor>Claudia Leacock</editor>
    <editor>Helen Yannakoudakis</editor>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <url>http://www.aclweb.org/anthology/W17-50</url>
    <bibtype>book</bibtype>
    <bibkey>BEA:2017</bibkey>
  </paper>

  <paper id="5001">
    <title>Question Difficulty &#8211; How to Estimate Without Norming, How to Use for Automated Grading</title>
    <author><first>Ulrike</first><last>Pado</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>1&#8211;10</pages>
    <url>http://www.aclweb.org/anthology/W17-5001</url>
    <abstract>Question difficulty estimates guide test creation, but are too costly for
	small-scale testing. We empirically verify that Bloom's Taxonomy, a standard
	tool for difficulty estimation during question creation, reliably predicts
	question difficulty observed after testing in a short-answer corpus. We also
	find that difficulty is mirrored in the amount of variation in student answers,
	which can be computed before grading.
	We show that question difficulty and its approximations are useful for
	automated grading, allowing us to identify the optimal feature set for
	grading each question even in an unseen-question setting.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>pado:2017:BEA</bibkey>
  </paper>

  <paper id="5002">
    <title>Combining CNNs and Pattern Matching for Question Interpretation in a Virtual Patient Dialogue System</title>
    <author><first>Lifeng</first><last>Jin</last></author>
    <author><first>Michael</first><last>White</last></author>
    <author><first>Evan</first><last>Jaffe</last></author>
    <author><first>Laura</first><last>Zimmerman</last></author>
    <author><first>Douglas</first><last>Danforth</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>11&#8211;21</pages>
    <url>http://www.aclweb.org/anthology/W17-5002</url>
    <abstract>For medical students, virtual patient dialogue systems can provide useful
	training opportunities without the cost of employing actors to portray
	standardized patients. This work utilizes word- and
	character-based convolutional neural networks (CNNs) for question
	identification in a virtual
	patient dialogue system, outperforming a strong word- and character-based
	logistic regression baseline.  While the CNNs perform well given sufficient
	training data, the best system performance is ultimately achieved by combining
	CNNs with a hand-crafted pattern matching system that is robust to label
	sparsity, providing a 10% boost in system accuracy and an error reduction of
	47% as compared to the pattern-matching system alone.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>jin-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5003">
    <title>Continuous fluency tracking and the challenges of varying text complexity</title>
    <author><first>Beata</first><last>Beigman Klebanov</last></author>
    <author><first>Anastassia</first><last>Loukina</last></author>
    <author><first>John</first><last>Sabatini</last></author>
    <author><first>Tenaha</first><last>O'Reilly</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>22&#8211;32</pages>
    <url>http://www.aclweb.org/anthology/W17-5003</url>
    <abstract>This paper is a preliminary report on using text complexity measurement in the
	service of a new educational application. We describe a reading intervention
	where a child takes turns reading a book aloud with a virtual reading partner.
	Our ultimate goal is to provide meaningful feedback to the parent or the
	teacher by continuously tracking the child's improvement in reading fluency. We
	show that this would not be a simple endeavor, due to an intricate relationship
	between text complexity from the point of view of comprehension and reading
	rate.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>beigmanklebanov-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5004">
    <title>Auxiliary Objectives for Neural Error Detection Models</title>
    <author><first>Marek</first><last>Rei</last></author>
    <author><first>Helen</first><last>Yannakoudakis</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>33&#8211;43</pages>
    <url>http://www.aclweb.org/anthology/W17-5004</url>
    <abstract>We investigate the utility of different auxiliary objectives and training
	strategies within a neural sequence labeling approach to error detection in
	learner writing. 
	Auxiliary costs provide the model with additional linguistic information,
	allowing it to learn general-purpose compositional features that can then be
	exploited for other objectives.
	Our experiments show that a joint learning approach trained with parallel
	labels on in-domain data improves performance over the previous best error
	detection system. 
	While the resulting model has the same number of parameters, the additional
	objectives allow it to be optimised more efficiently and achieve better
	performance.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>rei-yannakoudakis:2017:BEA</bibkey>
  </paper>

  <paper id="5005">
    <title>Linked Data for Language-Learning Applications</title>
    <author><first>Robyn</first><last>Loughnane</last></author>
    <author><first>Kate</first><last>McCurdy</last></author>
    <author><first>Peter</first><last>Kolb</last></author>
    <author><first>Stefan</first><last>Selent</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>44&#8211;51</pages>
    <url>http://www.aclweb.org/anthology/W17-5005</url>
    <abstract>The use of linked data within language-learning applications is an open
	research question. A research prototype is presented that applies linked-data
	principles to store linguistic annotation generated from language-learning
	content using a variety of NLP tools. The result is a database that links
	learning content, linguistic annotation and open-source resources, on top of
	which a diverse range of tools for language-learning applications can be built.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>loughnane-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5006">
    <title>Predicting Specificity in Classroom Discussion</title>
    <author><first>Luca</first><last>Lugini</last></author>
    <author><first>Diane</first><last>Litman</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>52&#8211;61</pages>
    <url>http://www.aclweb.org/anthology/W17-5006</url>
    <abstract>High quality classroom discussion is important to student development,
	enhancing
	abilities to express claims, reason about other students’ claims, and retain
	information for longer periods of time. Previous small-scale studies have shown
	that one indicator of classroom discussion quality is specificity. In this
	paper we tackle the problem of predicting specificity for classroom
	discussions. We propose several methods and feature sets capable of
	outperforming the state of the art in specificity prediction. Additionally, we
	provide a set of meaningful, interpretable features that can be used to analyze
	classroom discussions at a pedagogical level.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>lugini-litman:2017:BEA</bibkey>
  </paper>

  <paper id="5007">
    <title>A Report on the 2017 Native Language Identification Shared Task</title>
    <author><first>Shervin</first><last>Malmasi</last></author>
    <author><first>Keelan</first><last>Evanini</last></author>
    <author><first>Aoife</first><last>Cahill</last></author>
    <author><first>Joel</first><last>Tetreault</last></author>
    <author><first>Robert</first><last>Pugh</last></author>
    <author><first>Christopher</first><last>Hamill</last></author>
    <author><first>Diane</first><last>Napolitano</last></author>
    <author><first>Yao</first><last>Qian</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>62&#8211;75</pages>
    <url>http://www.aclweb.org/anthology/W17-5007</url>
    <abstract>Native Language Identification (NLI) is the task of automatically identifying
	the native language (L1) of an individual based on their language production in
	a learned language. It is typically framed as a classification task where the
	set of L1s is known a priori. Two previous shared tasks on NLI have been
	organized where the aim was to identify the L1 of learners of English based on
	essays (2013) and spoken responses (2016) they provided during a standardized
	assessment of academic English proficiency. The 2017 shared task combines the
	inputs from the two prior tasks for the first time. There are three tracks: NLI
	on the essay only, NLI on the spoken response only (based on a transcription of
	the response and i-vector acoustic features), and NLI using both responses. We
	believe this makes for a more interesting shared task while building on the
	methods and results from the previous two shared tasks. In this paper, we
	report the results of the shared task. A total of 19 teams competed across the
	three different sub-tasks. The fusion track showed that combining the written
	and spoken responses provides a large boost in prediction accuracy. Multiple
	classifier systems (e.g. ensembles and meta-classifiers) were the most
	effective in all tasks, with most based on traditional classifiers (e.g. SVMs)
	with lexical/syntactic features.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>malmasi-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5008">
    <title>Evaluation of Automatically Generated Pronoun Reference Questions</title>
    <author><first>Arief Yudha</first><last>Satria</last></author>
    <author><first>Takenobu</first><last>Tokunaga</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>76&#8211;85</pages>
    <url>http://www.aclweb.org/anthology/W17-5008</url>
    <abstract>This study provides a detailed analysis of evaluation of English pronoun
	reference questions which are created automatically by machine. Pronoun
	reference questions are multiple choice questions that ask test takers to
	choose an antecedent of a target pronoun in a reading passage from four
	options. The evaluation was performed from two perspectives: the perspective of
	English teachers and that of English learners. Item analysis suggests that
	machine-generated questions achieve comparable quality with human-made
	questions. Correlation analysis revealed a strong correlation between the
	scores of machine-generated questions and that of human-made questions.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>satria-tokunaga:2017:BEA</bibkey>
  </paper>

  <paper id="5009">
    <title>Predicting Audience's Laughter During Presentations Using Convolutional Neural Network</title>
    <author><first>Lei</first><last>Chen</last></author>
    <author><first>Chong Min</first><last>Lee</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>86&#8211;90</pages>
    <url>http://www.aclweb.org/anthology/W17-5009</url>
    <abstract>Public speakings play important roles in schools and work places and properly
	using humor contributes to effective presentations. For the purpose of
	automatically evaluating speakers' humor usage, we build a presentation corpus
	containing humorous utterances based on TED talks. Compared to previous data
	resources supporting humor recognition research, ours has several advantages,
	including (a) both positive and negative instances coming from a homogeneous
	data set, (b) containing a large number of speakers, and (c) being open.
	Focusing on using lexical cues for humor recognition, we systematically compare
	a newly emerging text classification method based on Convolutional Neural
	Networks (CNNs) with a well-established conventional method using linguistic
	knowledge. The advantages of the CNN method are both getting higher detection
	accuracies and being able to learn essential features automatically.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>chen-lee:2017:BEA</bibkey>
  </paper>

  <paper id="5010">
    <title>Collecting fluency corrections for spoken learner English</title>
    <author><first>Andrew</first><last>Caines</last></author>
    <author><first>Emma</first><last>Flint</last></author>
    <author><first>Paula</first><last>Buttery</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>91&#8211;100</pages>
    <url>http://www.aclweb.org/anthology/W17-5010</url>
    <abstract>We present crowdsourced collection of error annotations for transcriptions of
	spoken learner English. Our emphasis in data collection is on fluency
	corrections, a more complete correction than has traditionally been aimed for
	in grammatical error correction research (GEC). Fluency corrections require
	improvements to the text, taking discourse and utterance level semantics into
	account: the result is a more naturalistic, holistic version of the original.
	We propose that this shifted emphasis be reflected in a new name for the task:
	'holistic error correction' (HEC). We analyse crowdworker behaviour in HEC and
	conclude that the method is useful with certain amendments for future work.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>caines-flint-buttery:2017:BEA</bibkey>
  </paper>

  <paper id="5011">
    <title>Exploring Relationships Between Writing &#38; Broader Outcomes With Automated Writing Evaluation</title>
    <author><first>Jill</first><last>Burstein</last></author>
    <author><first>Dan</first><last>McCaffrey</last></author>
    <author><first>Beata</first><last>Beigman Klebanov</last></author>
    <author><first>Guangming</first><last>Ling</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>101&#8211;108</pages>
    <url>http://www.aclweb.org/anthology/W17-5011</url>
    <abstract>Writing is a challenge, especially for at-risk students who may lack the
	prerequisite writing skills required to persist in U.S. 4-year postsecondary
	(college) institutions. Educators teaching postsecondary courses requiring
	writing could benefit from a better understanding of writing achievement and
	its role in postsecondary success. In this paper, novel exploratory work
	examined how automated writing evaluation (AWE) can inform our understanding of
	the relationship between postsecondary writing skill and broader success
	outcomes. An exploratory study was conducted using test-taker essays from a
	standardized writing assessment of postsecondary student learning outcomes.
	Findings showed that for the essays, AWE features were found to be predictors
	of broader outcomes measures: college success and learning outcomes measures.
	Study findings illustrate AWE’s potential to support educational analytics &#8211;
	i.e., relationships between writing skill and broader outcomes &#8211; taking a
	step toward moving AWE beyond writing assessment and instructional use cases.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>burstein-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5012">
    <title>An Investigation into the Pedagogical Features of Documents</title>
    <author><first>Emily</first><last>Sheng</last></author>
    <author><first>Prem</first><last>Natarajan</last></author>
    <author><first>Jonathan</first><last>Gordon</last></author>
    <author><first>Gully</first><last>Burns</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>109&#8211;120</pages>
    <url>http://www.aclweb.org/anthology/W17-5012</url>
    <abstract>Characterizing the content of a technical document in terms of its learning
	utility can be useful for applications related to education, such as generating
	reading lists from large collections of documents. We refer to this learning
	utility as the &#x201c;pedagogical value&#x201d; of the document to the learner. While
	pedagogical value is an important concept that has been studied extensively
	within the education domain, there has been little work exploring it from a
	computational, i.e., natural language processing (NLP), perspective. To allow a
	computational exploration of this concept, we introduce the notion of
	&#x201c;pedagogical roles&#x201d; of documents (e.g., Tutorial and Survey) as an
	intermediary component for the study of pedagogical value. Given the lack of
	available corpora for our exploration, we create the first annotated corpus of
	pedagogical roles and use it to test baseline techniques for automatic
	prediction of such roles.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>sheng-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5013">
    <title>Combining Multiple Corpora for Readability Assessment for People with Cognitive Disabilities</title>
    <author><first>Victoria</first><last>Yaneva</last></author>
    <author><first>Constantin</first><last>Orasan</last></author>
    <author><first>Richard</first><last>Evans</last></author>
    <author><first>Omid</first><last>Rohanian</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>121&#8211;132</pages>
    <url>http://www.aclweb.org/anthology/W17-5013</url>
    <abstract>Given the lack of large user-evaluated corpora in disability-related NLP
	research (e.g. text simplification or readability assessment for people with
	cognitive disabilities), the question of choosing suitable training data for
	NLP models is not straightforward. The use of large generic corpora may be
	problematic because such data may not reflect the needs of the target
	population. The use of the available user-evaluated corpora may be problematic
	because these datasets are not large enough to be used as training data. In
	this paper we explore a third approach, in which a large generic corpus is
	combined with a smaller population-specific corpus to train a classifier which
	is
	evaluated using two sets of unseen user-evaluated data. One of these sets, the
	ASD Comprehension corpus, is developed for the purposes of this study and made
	freely available. We explore the effects of the size and type of the training
	data used on the performance of the classifiers, and the effects of the type of
	the unseen test datasets on the classification performance.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>yaneva-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5014">
    <title>Automatic Extraction of High-Quality Example Sentences for Word Learning Using a Determinantal Point Process</title>
    <author><first>Arseny</first><last>Tolmachev</last></author>
    <author><first>Sadao</first><last>Kurohashi</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>133&#8211;142</pages>
    <url>http://www.aclweb.org/anthology/W17-5014</url>
    <attachment type="attachment">W17-5014.Attachment.zip</attachment>
    <abstract>Flashcard systems are effective tools for learning words but have their
	limitations in teaching word usage. To overcome this problem, we propose a
	novel flashcard system that 
	shows a new example sentence on each repetition. This extension requires
	high-quality example sentences, automatically extracted from a huge corpus. To
	do this, we use a Determinantal Point Process which scales well to large data
	and allows to naturally represent sentence similarity and quality as features.
	Our human evaluation experiment on Japanese language indicates that the
	proposed method successfully extracted high-quality example sentences.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>tolmachev-kurohashi:2017:BEA</bibkey>
  </paper>

  <paper id="5015">
    <title>Distractor Generation for Chinese Fill-in-the-blank Items</title>
    <author><first>Shu</first><last>Jiang</last></author>
    <author><first>John</first><last>Lee</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>143&#8211;148</pages>
    <url>http://www.aclweb.org/anthology/W17-5015</url>
    <abstract>This paper reports the first study on automatic generation of distractors for
	fill-in-the-blank items for learning Chinese vocabulary.  We investigate the
	quality of distractors generated by a number of criteria, including
	part-of-speech, difficulty level, spelling, word co-occurrence and semantic
	similarity.  Evaluations show that a semantic similarity measure, based on the
	word2vec model, yields distractors that are significantly more plausible than
	those generated by baseline methods.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>jiang-lee:2017:BEA</bibkey>
  </paper>

  <paper id="5016">
    <title>An Error-Oriented Approach to Word Embedding Pre-Training</title>
    <author><first>Youmna</first><last>Farag</last></author>
    <author><first>Marek</first><last>Rei</last></author>
    <author><first>Ted</first><last>Briscoe</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>149&#8211;158</pages>
    <url>http://www.aclweb.org/anthology/W17-5016</url>
    <abstract>We propose a novel word embedding pre-training approach that exploits writing
	errors in learners' scripts. We compare our method to previous models that tune
	the embeddings based on script scores and the discrimination between correct
	and corrupt word contexts in addition to the generic commonly-used embeddings
	pre-trained on large corpora. The comparison is achieved by using the
	aforementioned models to bootstrap a neural network that learns to predict a
	holistic score for scripts. Furthermore, we investigate augmenting our model
	with error corrections and monitor the impact on performance. Our results show
	that our error-oriented approach outperforms other comparable ones which is
	further demonstrated when training on more data. Additionally, extending the
	model with corrections provides further performance gains when data sparsity is
	an issue.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>farag-rei-briscoe:2017:BEA</bibkey>
  </paper>

  <paper id="5017">
    <title>Investigating neural architectures for short answer scoring</title>
    <author><first>Brian</first><last>Riordan</last></author>
    <author><first>Andrea</first><last>Horbach</last></author>
    <author><first>Aoife</first><last>Cahill</last></author>
    <author><first>Torsten</first><last>Zesch</last></author>
    <author><first>Chong Min</first><last>Lee</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>159&#8211;168</pages>
    <url>http://www.aclweb.org/anthology/W17-5017</url>
    <abstract>Neural approaches to automated essay scoring have recently shown
	state-of-the-art performance. The automated essay scoring task typically
	involves a broad notion of writing quality that encompasses content, grammar,
	organization, and conventions. This differs from the short answer content
	scoring task, which focuses on content accuracy. The inputs to neural essay
	scoring models &#8211; ngrams and embeddings &#8211; are arguably well-suited to evaluate
	content in short answer scoring tasks. We investigate how several basic neural
	approaches similar to those used for automated essay scoring perform on short
	answer scoring. We show that neural architectures can outperform a strong
	non-neural baseline, but performance and optimal parameter settings vary across
	the more diverse types of prompts typical of short answer scoring.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>riordan-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5018">
    <title>Human and Automated CEFR-based Grading of Short Answers</title>
    <author><first>Ana&#239;s</first><last>Tack</last></author>
    <author><first>Thomas</first><last>Fran&#231;ois</last></author>
    <author><first>Sophie</first><last>Roekhaut</last></author>
    <author><first>C&#233;drick</first><last>Fairon</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>169&#8211;179</pages>
    <url>http://www.aclweb.org/anthology/W17-5018</url>
    <abstract>This paper is concerned with the task of automatically assessing the written
	proficiency level of non-native (L2) learners of English.
	Drawing on previous research on automated L2 writing assessment following the
	Common European Framework of Reference for Languages (CEFR), we investigate the
	possibilities and difficulties of deriving the CEFR level from short answers to
	open-ended questions, which has not yet been subjected to numerous studies up
	to date.
	The object of our study is twofold: to examine the intricacy involved with both
	human and automated CEFR-based grading of short answers.
	On the one hand, we describe the compilation of a learner corpus of short
	answers graded with CEFR levels by three certified Cambridge examiners.
	We mainly observe that, although the shortness of the answers is reported as
	undermining a clear-cut evaluation, the length of the answer does not
	necessarily correlate with inter-examiner disagreement.
	On the other hand, we explore the development of a soft-voting system for the
	automated CEFR-based grading of short answers and draw tentative conclusions
	about its use in a computer-assisted testing (CAT) setting.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>tack-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5019">
    <title>GEC into the future: Where are we going and how do we get there?</title>
    <author><first>Keisuke</first><last>Sakaguchi</last></author>
    <author><first>Courtney</first><last>Napoles</last></author>
    <author><first>Joel</first><last>Tetreault</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>180&#8211;187</pages>
    <url>http://www.aclweb.org/anthology/W17-5019</url>
    <abstract>The field of grammatical error correction (GEC) has made tremendous bounds in
	the last ten years, but new questions and obstacles are revealing themselves.
	In this position paper, we discuss the issues that need to be addressed and
	provide recommendations for the field to continue to make progress, and propose
	a new shared task. We invite suggestions and critiques from the audience to
	make the new shared task a community-driven venture.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>sakaguchi-napoles-tetreault:2017:BEA</bibkey>
  </paper>

  <paper id="5020">
    <title>Detecting Off-topic Responses to Visual Prompts</title>
    <author><first>Marek</first><last>Rei</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>188&#8211;197</pages>
    <url>http://www.aclweb.org/anthology/W17-5020</url>
    <abstract>Automated methods for essay scoring have made great progress in recent years,
	achieving accuracies very close to human annotators. 
	However, a known weakness of such automated scorers is not taking into account
	the semantic relevance of the submitted text.
	While there is existing work on detecting answer relevance given a textual
	prompt, very little previous research has been done to incorporate visual
	writing prompts.
	We propose a neural architecture and several extensions for detecting off-topic
	responses to visual prompts and evaluate it on a dataset of texts written by
	language learners.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>rei:2017:BEA</bibkey>
  </paper>

  <paper id="5021">
    <title>Combining Textual and Speech Features in the NLI Task Using State-of-the-Art Machine Learning Techniques</title>
    <author><first>Pavel</first><last>Ircing</last></author>
    <author><first>Jan</first><last>Svec</last></author>
    <author><first>Zbynek</first><last>Zajic</last></author>
    <author><first>Barbora</first><last>Hladka</last></author>
    <author><first>Martin</first><last>Holub</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>198&#8211;209</pages>
    <url>http://www.aclweb.org/anthology/W17-5021</url>
    <abstract>We summarize the involvement of our CEMI team in the &#x201c;NLI Shared Task 2017&#x201d;,
	which deals with both textual and speech input data. We submitted the results
	achieved by using three different system architectures; each of them combines
	multiple supervised learning models trained on various feature sets. As
	expected, better results are achieved with the systems that use both the
	textual data and the spoken responses. Combining the input data of two
	different modalities led to a rather dramatic improvement in classification
	performance. 
	Our best performing method is based on a set of feed-forward neural networks
	whose hidden-layer outputs are combined together using a softmax layer. We
	achieved a macro-averaged F1 score of 0.9257 on the evaluation (unseen) test
	set and our team placed first in the main task together with three other teams.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>ircing-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5022">
    <title>Native Language Identification Using a Mixture of Character and Word N-grams</title>
    <author><first>Elham</first><last>Mohammadi</last></author>
    <author><first>Hadi</first><last>Veisi</last></author>
    <author><first>Hessam</first><last>Amini</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>210&#8211;216</pages>
    <url>http://www.aclweb.org/anthology/W17-5022</url>
    <attachment type="attachment">W17-5022.Attachment.pdf</attachment>
    <abstract>Native language identification (NLI) is the task of determining an author's
	native language, based on a piece of his/her writing in a second language. In
	recent years, NLI has received much attention due to its challenging nature and
	its applications in language pedagogy and forensic linguistics. We participated
	in the NLI2017 shared task under the name UT-DSP. In our effort to implement a
	method for native language identification, we made use of a fusion of character
	and word N-grams, and achieved an optimal F1-Score of 77.64%, using both essay
	and speech transcription datasets.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>mohammadi-veisi-amini:2017:BEA</bibkey>
  </paper>

  <paper id="5023">
    <title>Ensemble Methods for Native Language Identification</title>
    <author><first>Sophia</first><last>Chan</last></author>
    <author><first>Maryam</first><last>Honari Jahromi</last></author>
    <author><first>Benjamin</first><last>Benetti</last></author>
    <author><first>Aazim</first><last>Lakhani</last></author>
    <author><first>Alona</first><last>Fyshe</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>217&#8211;223</pages>
    <url>http://www.aclweb.org/anthology/W17-5023</url>
    <abstract>Our team—Uvic-NLP—explored and evaluated a variety of lexical features for
	Native Language Identification (NLI) within the framework of ensemble methods.
	Using a subset of the highest performing features, we train Support Vector
	Machines (SVM) and Fully Connected Neural Networks (FCNN) as base classifiers,
	and test different methods for combining their outputs. Restricting our scope
	to the closed essay track in the NLI Shared Task 2017, we find that our best
	SVM ensemble achieves an F1 score of 0.8730 on the test set.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>chan-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5024">
    <title>Can string kernels pass the test of time in Native Language Identification?</title>
    <author><first>Radu Tudor</first><last>Ionescu</last></author>
    <author><first>Marius</first><last>Popescu</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>224&#8211;234</pages>
    <url>http://www.aclweb.org/anthology/W17-5024</url>
    <abstract>We describe a machine learning approach for the 2017 shared task on Native
	Language Identification (NLI). The proposed approach combines several kernels
	using multiple kernel learning. While most of our kernels are based on
	character p-grams (also known as n-grams) extracted from essays or speech
	transcripts, we also use a kernel based on i-vectors, a low-dimensional
	representation of audio recordings, provided by the shared task organizers. For
	the learning stage, we choose Kernel Discriminant Analysis (KDA) over Kernel
	Ridge Regression (KRR), because the former classifier obtains better results
	than the latter one on the development set. In our previous work, we have used
	a similar machine learning approach to achieve state-of-the-art NLI results.
	The goal of this paper is to demonstrate that our shallow and simple approach
	based on string kernels (with minor improvements) can pass the test of time and
	reach state-of-the-art performance in the 2017 NLI shared task, despite the
	recent advances in natural language processing. We participated in all three
	tracks, in which the competitors were allowed to use only the essays (essay
	track), only the speech transcripts (speech track), or both (fusion track).
	Using only the data provided by the organizers for training our models, we have
	reached a macro F1 score of 86.95% in the closed essay track, a macro F1 score
	of 87.55% in the closed speech track, and a macro F1 score of 93.19% in the
	closed fusion track. With these scores, our team (UnibucKernel) ranked in the
	first group of teams in all three tracks, while attaining the best scores in
	the speech and the fusion tracks.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>ionescu-popescu:2017:BEA</bibkey>
  </paper>

  <paper id="5025">
    <title>Neural Networks and Spelling Features for Native Language Identification</title>
    <author><first>Johannes</first><last>Bjerva</last></author>
    <author><first>Gintare</first><last>Grigonyte</last></author>
    <author><first>Robert</first><last>&#214;stling</last></author>
    <author><first>Barbara</first><last>Plank</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>235&#8211;239</pages>
    <url>http://www.aclweb.org/anthology/W17-5025</url>
    <abstract>We present the RUG-SU team's submission at the Native Language Identification
	Shared Task 2017.
	We combine several approaches into an ensemble, based on spelling error
	features, a simple neural network using word representations, a deep residual
	network using word and character features, and a system based on a recurrent
	neural network. 
	Our best system is an ensemble of neural networks, reaching an F1 score of
	0.8323.
	Although our system is not the highest ranking one, we do outperform the
	baseline by far.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>bjerva-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5026">
    <title>A study of N-gram and Embedding Representations for Native Language Identification</title>
    <author><first>Sowmya</first><last>Vajjala</last></author>
    <author><first>Sagnik</first><last>Banerjee</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>240&#8211;248</pages>
    <url>http://www.aclweb.org/anthology/W17-5026</url>
    <abstract>We report on our experiments with N-gram and embedding based feature
	representations for Native Language Identification (NLI) as a part of the NLI
	Shared Task 2017 (team name: NLI-ISU). Our best performing system on the test
	set for written essays had a macro F1 of 0.8264 and was based on word uni, bi
	and trigram features. We explored n-grams covering word, character, POS and
	word-POS mixed representations for this task. For embedding based feature
	representations, we employed both word and document embeddings. We had a
	relatively poor performance with all embedding representations compared to
	n-grams, which could be because of the fact that embeddings capture semantic
	similarities whereas L1 differences are more stylistic in nature.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>vajjala-banerjee:2017:BEA</bibkey>
  </paper>

  <paper id="5027">
    <title>A Shallow Neural Network for Native Language Identification with Character N-grams</title>
    <author><first>Yunita</first><last>Sari</last></author>
    <author><first>Muhammad</first><last>Rifqi Fatchurrahman</last></author>
    <author><first>Meisyarah</first><last>Dwiastuti</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>249&#8211;254</pages>
    <url>http://www.aclweb.org/anthology/W17-5027</url>
    <abstract>This paper describes the systems submitted by the GadjahMada team to the Native
	Language Identification (NLI) Shared Task 2017. Our models used a continuous
	representation of character n-grams which are learned jointly with a feed-forward
	neural network classifier. Character n-grams have been proven to be effective
	for style-based identification tasks including NLI. Results on the test set
	demonstrate that the proposed model performs very well on essay and fusion
	tracks by obtaining more than 0.8 on both F-macro score and accuracy.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>sari-rifqifatchurrahman-dwiastuti:2017:BEA</bibkey>
  </paper>

  <paper id="5028">
    <title>Fewer features perform well at Native Language Identification task</title>
    <author><first>Taraka</first><last>Rama</last></author>
    <author><first>&#199;a&#287;rı</first><last>&#199;&#246;ltekin</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>255&#8211;260</pages>
    <url>http://www.aclweb.org/anthology/W17-5028</url>
    <abstract>This paper describes our results at the NLI shared task 2017.  We participated
	in essays, speech, and fusion task that uses text, speech, and i-vectors for
	the task of identifying the native language of the given input. In the essay
	track, a linear SVM system using word bigrams and character 7-grams performed
	the best. In the speech track, an LDA classifier based only on i-vectors
	performed better than a combination system using text features from speech
	transcriptions and i-vectors. In the fusion task, we experimented with systems
	that used combination of i-vectors with higher order n-grams features,
	combination of i-vectors with word unigrams, a mean probability ensemble, and a
	stacked ensemble system. Our finding is that word unigrams in combination with
	i-vectors achieve higher score than systems trained with larger number of
	n-gram features.  Our best-performing systems achieved F1-scores of
	87.16%, 83.33% and 91.75% on the essay track, the speech track and the fusion
	track respectively.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>rama-ccoltekin:2017:BEA</bibkey>
  </paper>

  <paper id="5029">
    <title>Structured Generation of Technical Reading Lists</title>
    <author><first>Jonathan</first><last>Gordon</last></author>
    <author><first>Stephen</first><last>Aguilar</last></author>
    <author><first>Emily</first><last>Sheng</last></author>
    <author><first>Gully</first><last>Burns</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>261&#8211;270</pages>
    <url>http://www.aclweb.org/anthology/W17-5029</url>
    <abstract>Learners need to find suitable documents to read and prioritize them in an
	appropriate order. We present a method of automatically generating reading
	lists, selecting documents based on their pedagogical value to the learner and
	ordering them using the structure of concepts in the domain. Resulting reading
	lists related to computational linguistics were evaluated by advanced learners
	and judged to be near the quality of those generated by domain experts. We
	provide an open-source implementation of our method to enable future work on
	reading list generation.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>gordon-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5030">
    <title>Effects of Lexical Properties on Viewing Time per Word in Autistic and Neurotypical Readers</title>
    <author><first>Sanja</first><last>&#x160;tajner</last></author>
    <author><first>Victoria</first><last>Yaneva</last></author>
    <author><first>Ruslan</first><last>Mitkov</last></author>
    <author><first>Simone Paolo</first><last>Ponzetto</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>271&#8211;281</pages>
    <url>http://www.aclweb.org/anthology/W17-5030</url>
    <abstract>Eye tracking studies from the past few decades have shaped the way we think
	of word complexity and cognitive load: words that are long, rare and ambiguous
	are more difficult to read. However, online processing techniques have been
	scarcely applied to investigating the reading difficulties of people with
	autism and what vocabulary is challenging for them. We present parallel gaze
	data obtained from adult readers with autism and a control group of
	neurotypical readers and show that the former required higher cognitive
	effort to comprehend the texts as evidenced by three gaze-based measures. We
	divide all words into four classes based on their viewing times for both groups
	and investigate the relationship between longer viewing times and word length,
	word frequency, and four cognitively-based measures (word concreteness,
	familiarity, age of acquisition and imageability).</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>vstajner-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5031">
    <title>Transparent text quality assessment with convolutional neural networks</title>
    <author><first>Robert</first><last>&#214;stling</last></author>
    <author><first>Gintare</first><last>Grigonyte</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>282&#8211;286</pages>
    <url>http://www.aclweb.org/anthology/W17-5031</url>
    <abstract>We present a very simple model for text quality assessment based on a deep
	convolutional neural network, where the only supervision required is one corpus
	of user-generated text of varying quality, and one contrasting text corpus of
	consistently high quality. Our model is able to provide local quality
	assessments in different parts of a text, which allows visual feedback about
	where potentially problematic parts of the text are located, as well as a way
	to evaluate which textual features are captured by our model. We evaluate our
	method on two corpora: a large corpus of manually graded student essays and a
	longitudinal corpus of language learner written production, and find that the
	text quality metric learned by our model is a fairly strong predictor of both
	essay grade and learner proficiency level.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>ostling-grigonyte:2017:BEA</bibkey>
  </paper>

  <paper id="5032">
    <title>Artificial Error Generation with Machine Translation and Syntactic Patterns</title>
    <author><first>Marek</first><last>Rei</last></author>
    <author><first>Mariano</first><last>Felice</last></author>
    <author><first>Zheng</first><last>Yuan</last></author>
    <author><first>Ted</first><last>Briscoe</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>287&#8211;292</pages>
    <url>http://www.aclweb.org/anthology/W17-5032</url>
    <abstract>Shortage of available training data is holding back progress in the area of
	automated error detection.
	This paper investigates two alternative methods for artificially generating
	writing errors, in order to create additional resources.
	We propose treating error generation as a machine translation task, where
	grammatically correct text is translated to contain errors.
	In addition, we explore a system for extracting textual patterns from an
	annotated corpus, which can then be used to insert errors into grammatically
	correct sentences.
	Our experiments show that the inclusion of artificially generated errors
	significantly improves error detection accuracy on both FCE and CoNLL 2014
	datasets.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>rei-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5033">
    <title>Modelling semantic acquisition in second language learning</title>
    <author><first>Ekaterina</first><last>Kochmar</last></author>
    <author><first>Ekaterina</first><last>Shutova</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>293&#8211;302</pages>
    <url>http://www.aclweb.org/anthology/W17-5033</url>
    <abstract>Using methods of statistical analysis, we investigate how semantic knowledge is
	acquired in English as a second language and evaluate the pace of development
	across a number of predicate types and content word combinations, as well as
	across the levels of language proficiency and native languages. Our exploratory
	study helps identify the most problematic areas for language learners with
	different backgrounds and at different stages of learning.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>kochmar-shutova:2017:BEA</bibkey>
  </paper>

  <paper id="5034">
    <title>Multiple Choice Question Generation Utilizing An Ontology</title>
    <author><first>Katherine</first><last>Stasaski</last></author>
    <author><first>Marti A.</first><last>Hearst</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>303&#8211;312</pages>
    <url>http://www.aclweb.org/anthology/W17-5034</url>
    <attachment type="attachment">W17-5034.Attachment.zip</attachment>
    <abstract>Ontologies provide a structured representation of concepts and the
	relationships which connect them. This work investigates how a pre-existing
	educational Biology ontology can be used to generate useful practice questions
	for students by using the connectivity structure in a novel way. It also
	introduces a novel way to generate multiple-choice distractors from the
	ontology, and compares this to a baseline of using embedding representations of
	nodes. An assessment by an experienced science teacher shows a significant
	advantage over a baseline when using the ontology for distractor generation. A
	subsequent study with three science teachers on the results of a modified
	question generation algorithm finds significant improvements. An in-depth
	analysis of the teachers’ comments yields useful insights for any researcher
	working on automated question generation for educational applications.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>stasaski-hearst:2017:BEA</bibkey>
  </paper>

  <paper id="5035">
    <title>Simplifying metaphorical language for young readers: A corpus study on news text</title>
    <author><first>Magdalena</first><last>Wolska</last></author>
    <author><first>Yulia</first><last>Clausen</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>313&#8211;318</pages>
    <url>http://www.aclweb.org/anthology/W17-5035</url>
    <abstract>The paper presents first results of an ongoing project on text
	simplification focusing on linguistic metaphors. Based on an analysis
	of a parallel corpus of news text professionally simplified for
	different grade levels, we identify six types of simplification
	choices falling into two broad categories: preserving metaphors or
	dropping them. An annotation study on almost 300 source sentences with
	metaphors (grade level 12) and their simplified counterparts (grade 4)
	is conducted. The results show that most metaphors are preserved and
	when they are dropped, the semantic content tends to be preserved
	rather than dropped, however, it is reworded without metaphorical
	language. In general, some of the expected tendencies in complexity
	reduction, measured with psycholinguistic variables linked to metaphor
	comprehension, are observed, suggesting good prospect for machine
	learning-based metaphor simplification.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>wolska-clausen:2017:BEA</bibkey>
  </paper>

  <paper id="5036">
    <title>Language Based Mapping of Science Assessment Items to Skills</title>
    <author><first>Farah</first><last>Nadeem</last></author>
    <author><first>Mari</first><last>Ostendorf</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>319&#8211;326</pages>
    <url>http://www.aclweb.org/anthology/W17-5036</url>
    <attachment type="attachment">W17-5036.Attachment.zip</attachment>
    <abstract>Knowledge of the association between assessment questions and the skills
	required to solve them is necessary for analysis of student learning. This
	association, often represented as a Q-matrix, is either hand-labeled by domain
	experts or learned as latent variables given a large student response data set.
	As a means of automating the match to formal standards, this paper uses neural
	text classification methods, leveraging the language in the standards documents
	to identify online text for a proxy training task. Experiments involve
	identifying the topic and crosscutting concepts of middle school science
	questions leveraging multi-task training. Results show that it is possible to
	automatically build a Q-matrix without student response data and using a modest
	number of hand-labeled questions.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>nadeem-ostendorf:2017:BEA</bibkey>
  </paper>

  <paper id="5037">
    <title>Connecting the Dots: Towards Human-Level Grammatical Error Correction</title>
    <author><first>Shamil</first><last>Chollampatt</last></author>
    <author><first>Hwee Tou</first><last>Ng</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>327&#8211;333</pages>
    <url>http://www.aclweb.org/anthology/W17-5037</url>
    <abstract>We build a grammatical error correction (GEC) system primarily based on the
	state-of-the-art statistical machine translation (SMT) approach, using
	task-specific features and tuning, and further enhance it with the modeling
	power of neural network joint models. The SMT-based system is weak in
	generalizing beyond patterns seen during training and lacks granularity below
	the word level. To address this issue, we incorporate a character-level SMT
	component targeting the misspelled words that the original SMT-based system
	fails to correct. Our final system achieves 53.14% F0.5 score on the benchmark
	CoNLL-2014 test set, an improvement of 3.62% F0.5 over the best previous
	published score.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>chollampatt-ng:2017:BEA</bibkey>
  </paper>

  <paper id="5038">
    <title>Question Generation for Language Learning: From ensuring texts are read to supporting learning</title>
    <author><first>Maria</first><last>Chinkina</last></author>
    <author><first>Detmar</first><last>Meurers</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>334&#8211;344</pages>
    <url>http://www.aclweb.org/anthology/W17-5038</url>
    <abstract>In Foreign Language Teaching and Learning (FLTL), questions are systematically
	used to assess the learner’s understanding of a text. Computational
	linguistic approaches have been developed to generate such questions
	automatically given a text (e.g., Heilman, 2011). In this paper, we want to
	broaden the perspective on the different functions questions can play in FLTL
	and discuss how automatic question generation can support the different uses.
	Complementing the focus on meaning and comprehension, we want to highlight the
	fact that questions can also be used to make learners notice form aspects of
	the linguistic system and their interpretation. Automatically generating
	questions that target linguistic forms and grammatical categories in a text in
	essence supports incidental focus-on-form (Loewen, 2005) in a meaning-focused
	reading task. We discuss two types of questions serving this purpose, how they
	can be generated automatically; and we report on a crowd-sourcing evaluation
	comparing automatically generated to manually written questions targeting
	particle verbs, a challenging linguistic form for learners of English.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>chinkina-meurers:2017:BEA</bibkey>
  </paper>

  <paper id="5039">
    <title>Systematically Adapting Machine Translation for Grammatical Error Correction</title>
    <author><first>Courtney</first><last>Napoles</last></author>
    <author><first>Chris</first><last>Callison-Burch</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>345&#8211;356</pages>
    <url>http://www.aclweb.org/anthology/W17-5039</url>
    <abstract>In this work we adapt machine translation (MT) to grammatical error correction,
	identifying how components of the statistical MT pipeline can be modified for
	this task and analyzing how each modification impacts system performance. We
	evaluate the contribution of each of these components with standard evaluation
	metrics and automatically characterize the morphological and lexical
	transformations made in system output. Our model rivals the current state of
	the art using a fraction of the training data.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>napoles-callisonburch:2017:BEA</bibkey>
  </paper>

  <paper id="5040">
    <title>Fine-grained essay scoring of a complex writing task for native speakers</title>
    <author><first>Andrea</first><last>Horbach</last></author>
    <author><first>Dirk</first><last>Scholten-Akoun</last></author>
    <author><first>Yuning</first><last>Ding</last></author>
    <author><first>Torsten</first><last>Zesch</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>357&#8211;366</pages>
    <url>http://www.aclweb.org/anthology/W17-5040</url>
    <abstract>Automatic essay scoring is nowadays successfully used even in high-stakes
	tests, but this is mainly limited to holistic scoring of learner essays.
	We present a new dataset of essays written by highly proficient German native
	speakers that is scored using a fine-grained rubric with the goal to provide
	detailed feedback.
	Our experiments with two state-of-the-art scoring systems (a neural and an
	SVM-based one) show a large drop in performance compared to existing
	datasets.
	This demonstrates the need for such datasets that allow to guide research on
	more elaborate essay scoring methods.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>horbach-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5041">
    <title>Exploring Optimal Voting in Native Language Identification</title>
    <author><first>Cyril</first><last>Goutte</last></author>
    <author><first>Serge</first><last>L&#233;ger</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>367&#8211;373</pages>
    <url>http://www.aclweb.org/anthology/W17-5041</url>
    <abstract>We describe the submissions entered by the National Research Council
	Canada in the NLI-2017 evaluation. We mainly explored the use of
	voting, and various ways to optimize the choice and number of voting
	systems.  We also explored the use of features that rely on no
	linguistic preprocessing. Long ngrams of characters obtained from raw
	text turned out to yield the best performance on all textual input
	(written essays and speech transcripts). Voting ensembles turned out
	to produce small performance gains, with little difference between the
	various optimization strategies we tried. Our top systems achieved
	accuracies of 87% on the essay track, 84% on the
	speech track, and close to 92% by combining essays, speech and
	i-vectors in the fusion track.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>goutte-leger:2017:BEA</bibkey>
  </paper>

  <paper id="5042">
    <title>CIC-FBK Approach to Native Language Identification</title>
    <author><first>Ilia</first><last>Markov</last></author>
    <author><first>Lingzhen</first><last>Chen</last></author>
    <author><first>Carlo</first><last>Strapparava</last></author>
    <author><first>Grigori</first><last>Sidorov</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>374&#8211;381</pages>
    <url>http://www.aclweb.org/anthology/W17-5042</url>
    <abstract>We present the CIC-FBK system, which took part in the Native Language
	Identification (NLI) Shared Task 2017. Our approach combines features commonly
	used in previous NLI research, i.e., word n-grams, lemma n-grams,
	part-of-speech n-grams, and function words, with recently introduced character
	n-grams from misspelled words, and features that are novel in this task, such
	as typed character n-grams, and syntactic n-grams of words and of syntactic
	relation tags. We use log-entropy weighting scheme and perform classification
	using the Support Vector Machines (SVM) algorithm. Our system achieved 0.8808
	macro-averaged F1-score and shared the 1st rank in the NLI Shared Task 2017
	scoring.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>markov-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5043">
    <title>The Power of Character N-grams in Native Language Identification</title>
    <author><first>Artur</first><last>Kulmizev</last></author>
    <author><first>Bo</first><last>Blankers</last></author>
    <author><first>Johannes</first><last>Bjerva</last></author>
    <author><first>Malvina</first><last>Nissim</last></author>
    <author><first>Gertjan</first><last>van Noord</last></author>
    <author><first>Barbara</first><last>Plank</last></author>
    <author><first>Martijn</first><last>Wieling</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>382&#8211;389</pages>
    <url>http://www.aclweb.org/anthology/W17-5043</url>
    <abstract>In this paper, we explore the performance of a linear SVM trained on language
	independent character features for the NLI Shared Task 2017. Our basic system
	(GRONINGEN) achieves the best performance (87.56 F1-score) on the evaluation
	set using only 1-9 character n-grams as features. We compare this against
	several ensemble and meta-classifiers in order to examine how the linear system
	fares when combined with other, especially non-linear classifiers. Special
	emphasis is placed on the topic bias that exists by virtue of the assessment
	essay prompt distribution.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>kulmizev-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5044">
    <title>Classifier Stacking for Native Language Identification</title>
    <author><first>Wen</first><last>Li</last></author>
    <author><first>Liang</first><last>Zou</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>390&#8211;397</pages>
    <url>http://www.aclweb.org/anthology/W17-5044</url>
    <abstract>This paper reports our contribution (team WLZ) to the NLI Shared Task 2017
	(essay track). We first extract lexical and syntactic features from the essays,
	perform feature weighting and selection, and train linear support vector
	machine (SVM) classifiers each on an individual feature type. The output of
	base classifiers, as probabilities for each class, are then fed into a
	multilayer perceptron to predict the native language of the author. We also
	report the performance of each feature type, as well as the best features of a
	type. Our system achieves an accuracy of 86.55%, which is among the best
	performing systems of this shared task.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>li-zou:2017:BEA</bibkey>
  </paper>

  <paper id="5045">
    <title>Native Language Identification on Text and Speech</title>
    <author><first>Marcos</first><last>Zampieri</last></author>
    <author><first>Alina Maria</first><last>Ciobanu</last></author>
    <author><first>Liviu P.</first><last>Dinu</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>398&#8211;404</pages>
    <url>http://www.aclweb.org/anthology/W17-5045</url>
    <abstract>This paper presents an ensemble system combining the output of multiple SVM
	classifiers to native language identification (NLI). The system was submitted
	to the NLI Shared Task 2017 fusion track which featured students essays and
	spoken responses in form of audio transcriptions and iVectors by non-native
	English speakers of eleven native languages. Our system competed in the
	challenge under the team name ZCD and was based on an ensemble of SVM
	classifiers trained on character n-grams achieving 83.58% accuracy and ranking
	3rd in the shared task.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>zampieri-ciobanu-dinu:2017:BEA</bibkey>
  </paper>

  <paper id="5046">
    <title>Native Language Identification using Phonetic Algorithms</title>
    <author><first>Charese</first><last>Smiley</last></author>
    <author><first>Sandra</first><last>K&#252;bler</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>405&#8211;412</pages>
    <url>http://www.aclweb.org/anthology/W17-5046</url>
    <abstract>In this paper, we discuss the results of the IUCL system in the NLI Shared Task
	2017. For our system, we explore a variety of phonetic algorithms to generate
	features for Native Language Identification. These features are contrasted with
	one of the most successful type of features in NLI, character n-grams. We find
	that although phonetic features do not perform as well as character n-grams
	alone, they do increase overall F1 score when used together with character
	n-grams.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>smiley-kubler:2017:BEA</bibkey>
  </paper>

  <paper id="5047">
    <title>A deep-learning based native-language classification by using a latent semantic analysis for the NLI Shared Task 2017</title>
    <author><first>Yoo Rhee</first><last>Oh</last></author>
    <author><first>Hyung-Bae</first><last>Jeon</last></author>
    <author><first>Hwa Jeon</first><last>Song</last></author>
    <author><first>Yun-Kyung</first><last>Lee</last></author>
    <author><first>Jeon-Gue</first><last>Park</last></author>
    <author><first>Yun-Keun</first><last>Lee</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>413&#8211;422</pages>
    <url>http://www.aclweb.org/anthology/W17-5047</url>
    <abstract>This paper proposes a deep-learning based native-language identification (NLI)
	using a latent semantic analysis (LSA) as a participant (ETRI-SLP) of the NLI
	Shared Task 2017 where the NLI Shared Task 2017 aims to detect the native
	language of an essay or speech response of a standardized assessment of English
	proficiency for academic purposes. To this end, we use the six unit forms of a
	text data such as character 4/5/6-grams and word 1/2/3-grams. For each unit
	form of text data, we convert it into a count-based vector, extract a 2000-rank
	LSA feature, and perform a linear discriminant analysis (LDA) based dimension
	reduction. From the count-based vector or the LSA-LDA feature, we also obtain
	the output prediction values of a support vector machine (SVM) based
	classifier, the output prediction values of a deep neural network (DNN) based
	classifier, and the bottleneck values of a DNN based classifier. In order to
	incorporate the various kinds of text-based features and a speech-based
	i-vector feature, we design two DNN based ensemble classifiers for late fusion
	and early fusion, respectively. From the NLI experiments, the F1 (macro) scores
	are obtained as 0.8601, 0.8664, and 0.9220 for the essay track, the speech
	track, and the fusion track, respectively. The proposed method has comparable
	performance to the top-ranked teams for the speech and fusion tracks, although
	it has slightly lower performance for the essay track.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>oh-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5048">
    <title>Fusion of Simple Models for Native Language Identification</title>
    <author><first>Fabio</first><last>Kepler</last></author>
    <author><first>Ram&#243;n</first><last>Astudillo</last></author>
    <author><first>Alberto</first><last>Abad</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>423&#8211;429</pages>
    <url>http://www.aclweb.org/anthology/W17-5048</url>
    <abstract>In this paper we describe the approaches we explored for the 2017 Native
	Language Identification shared task. We focused on simple word and sub-word
	units avoiding heavy use of hand-crafted features. Following recent trends, we
	explored linear and neural networks models to attempt to compensate for the
	lack of rich feature use. Initial efforts yielded f1-scores of 82.39% and
	83.77% in the development and test sets of the fusion track, and were
	officially submitted to the task as team L2F. After the task was closed, we
	carried on further experiments and relied on a late fusion strategy for
	combining our simple proposed approaches with modifications of the baselines
	provided by the task. As expected, the i-vectors based sub-system dominates the
	performance of the system combinations, and results in the major contributor to
	our achieved scores. Our best combined system achieves 90.1% and 90.2% f1-score
	in the development and test sets of the fusion track, respectively.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>kepler-astudillo-abad:2017:BEA</bibkey>
  </paper>

  <paper id="5049">
    <title>Stacked Sentence-Document Classifier Approach for Improving Native Language Identification</title>
    <author><first>Andrea</first><last>Cimino</last></author>
    <author><first>Felice</first><last>Dell'Orletta</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>430&#8211;437</pages>
    <url>http://www.aclweb.org/anthology/W17-5049</url>
    <abstract>In this paper, we describe the approach of the ItaliaNLP Lab team to native
	language identification and discuss the results we submitted as participants to
	the essay track of NLI Shared Task 2017. We introduce for the first time a
	2-stacked sentence-document architecture for native language identification
	that is able to exploit both local sentence information and a wide set of
	general-purpose features qualifying the lexical and grammatical structure of
	the whole document. When evaluated on the official test set, our
	sentence-document stacked architecture obtained the best result among all the
	participants of the essay track with an F1 score of 0.8818.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>cimino-dellorletta:2017:BEA</bibkey>
  </paper>

  <paper id="5050">
    <title>Using Gaze to Predict Text Readability</title>
    <author><first>Ana Valeria</first><last>Gonzalez-Gardu&#241;o</last></author>
    <author><first>Anders</first><last>S&#248;gaard</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>438&#8211;443</pages>
    <url>http://www.aclweb.org/anthology/W17-5050</url>
    <abstract>We show that text readability prediction improves significantly from hard
	parameter sharing with models predicting first pass duration, total fixation
	duration and regression duration. Specifically, we induce multi-task Multilayer
	Perceptrons and Logistic Regression models over sentence representations that
	capture various aggregate statistics, from two different text readability
	corpora for English, as well as the Dundee eye-tracking corpus. Our approach
	leads to significant improvements over Single task learning and over previous
	systems. In addition, our improvements are consistent across train sample
	sizes, making our approach especially applicable to small datasets.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>gonzalezgarduno-sogaard:2017:BEA</bibkey>
  </paper>

  <paper id="5051">
    <title>Annotating Orthographic Target Hypotheses in a German L1 Learner Corpus</title>
    <author><first>Ronja</first><last>Laarmann-Quante</last></author>
    <author><first>Katrin</first><last>Ortmann</last></author>
    <author><first>Anna</first><last>Ehlert</last></author>
    <author><first>Maurice</first><last>Vogel</last></author>
    <author><first>Stefanie</first><last>Dipper</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>444&#8211;456</pages>
    <url>http://www.aclweb.org/anthology/W17-5051</url>
    <abstract>NLP applications for learners often rely on annotated learner corpora. Thereby,
	it is important that the annotations are both meaningful for the task, and
	consistent and reliable. We present a new longitudinal L1 learner corpus for
	German (handwritten texts collected in grade 2&#8211;4), which is transcribed and
	annotated with a target hypothesis that strictly only corrects orthographic
	errors, and is thereby tailored to research and tool development for
	orthographic issues in primary school. While for most corpora, transcription
	and target hypothesis are not evaluated, we conducted a detailed
	inter-annotator agreement study for both tasks. Although we achieved high
	agreement, our discussion of cases of disagreement shows that even with
	detailed guidelines, annotators differ here and there for different reasons,
	which should also be considered when working with transcriptions and target
	hypotheses of other corpora, especially if no explicit guidelines for their
	construction are known.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>laarmannquante-EtAl:2017:BEA</bibkey>
  </paper>

  <paper id="5052">
    <title>A Large Scale Quantitative Exploration of Modeling Strategies for Content Scoring</title>
    <author><first>Nitin</first><last>Madnani</last></author>
    <author><first>Anastassia</first><last>Loukina</last></author>
    <author><first>Aoife</first><last>Cahill</last></author>
    <booktitle>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</booktitle>
    <month>September</month>
    <year>2017</year>
    <address>Copenhagen, Denmark</address>
    <publisher>Association for Computational Linguistics</publisher>
    <pages>457&#8211;467</pages>
    <url>http://www.aclweb.org/anthology/W17-5052</url>
    <abstract>We explore various supervised learning strategies for automated scoring of
	content knowledge for a large corpus of 130 different content-based questions
	spanning four subject areas (Science, Math, English Language Arts, and Social
	Studies) and containing over 230,000 responses scored by human raters. Based on
	our analyses, we provide specific recommendations for content scoring. These
	are based on patterns observed across multiple questions and assessments and
	are, therefore, likely to generalize to other scenarios and prove useful to the
	community as automated content scoring becomes more popular in schools and
	classrooms.</abstract>
    <bibtype>inproceedings</bibtype>
    <bibkey>madnani-loukina-cahill:2017:BEA</bibkey>
  </paper>

</volume>

