@inproceedings{yaneva-etal-2017-combining,
title = "Combining Multiple Corpora for Readability Assessment for People with Cognitive Disabilities",
author = "Yaneva, Victoria and
Or{\u{a}}san, Constantin and
Evans, Richard and
Rohanian, Omid",
editor = "Tetreault, Joel and
Burstein, Jill and
Leacock, Claudia and
Yannakoudakis, Helen",
booktitle = "Proceedings of the 12th Workshop on Innovative Use of {NLP} for Building Educational Applications",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W17-5013",
doi = "10.18653/v1/W17-5013",
pages = "121--132",
abstract = "Given the lack of large user-evaluated corpora in disability-related NLP research (e.g. text simplification or readability assessment for people with cognitive disabilities), the question of choosing suitable training data for NLP models is not straightforward. The use of large generic corpora may be problematic because such data may not reflect the needs of the target population. The use of the available user-evaluated corpora may be problematic because these datasets are not large enough to be used as training data. In this paper we explore a third approach, in which a large generic corpus is combined with a smaller population-specific corpus to train a classifier which is evaluated using two sets of unseen user-evaluated data. One of these sets, the ASD Comprehension corpus, is developed for the purposes of this study and made freely available. We explore the effects of the size and type of the training data used on the performance of the classifiers, and the effects of the type of the unseen test datasets on the classification performance.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yaneva-etal-2017-combining">
<titleInfo>
<title>Combining Multiple Corpora for Readability Assessment for People with Cognitive Disabilities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Victoria</namePart>
<namePart type="family">Yaneva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Constantin</namePart>
<namePart type="family">Orăsan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Evans</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Omid</namePart>
<namePart type="family">Rohanian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</title>
</titleInfo>
<name type="personal">
<namePart type="given">Joel</namePart>
<namePart type="family">Tetreault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jill</namePart>
<namePart type="family">Burstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Leacock</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helen</namePart>
<namePart type="family">Yannakoudakis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Copenhagen, Denmark</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Given the lack of large user-evaluated corpora in disability-related NLP research (e.g. text simplification or readability assessment for people with cognitive disabilities), the question of choosing suitable training data for NLP models is not straightforward. The use of large generic corpora may be problematic because such data may not reflect the needs of the target population. The use of the available user-evaluated corpora may be problematic because these datasets are not large enough to be used as training data. In this paper we explore a third approach, in which a large generic corpus is combined with a smaller population-specific corpus to train a classifier which is evaluated using two sets of unseen user-evaluated data. One of these sets, the ASD Comprehension corpus, is developed for the purposes of this study and made freely available. We explore the effects of the size and type of the training data used on the performance of the classifiers, and the effects of the type of the unseen test datasets on the classification performance.</abstract>
<identifier type="citekey">yaneva-etal-2017-combining</identifier>
<identifier type="doi">10.18653/v1/W17-5013</identifier>
<location>
<url>https://aclanthology.org/W17-5013</url>
</location>
<part>
<date>2017-09</date>
<extent unit="page">
<start>121</start>
<end>132</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Combining Multiple Corpora for Readability Assessment for People with Cognitive Disabilities
%A Yaneva, Victoria
%A Orăsan, Constantin
%A Evans, Richard
%A Rohanian, Omid
%Y Tetreault, Joel
%Y Burstein, Jill
%Y Leacock, Claudia
%Y Yannakoudakis, Helen
%S Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications
%D 2017
%8 September
%I Association for Computational Linguistics
%C Copenhagen, Denmark
%F yaneva-etal-2017-combining
%X Given the lack of large user-evaluated corpora in disability-related NLP research (e.g. text simplification or readability assessment for people with cognitive disabilities), the question of choosing suitable training data for NLP models is not straightforward. The use of large generic corpora may be problematic because such data may not reflect the needs of the target population. The use of the available user-evaluated corpora may be problematic because these datasets are not large enough to be used as training data. In this paper we explore a third approach, in which a large generic corpus is combined with a smaller population-specific corpus to train a classifier which is evaluated using two sets of unseen user-evaluated data. One of these sets, the ASD Comprehension corpus, is developed for the purposes of this study and made freely available. We explore the effects of the size and type of the training data used on the performance of the classifiers, and the effects of the type of the unseen test datasets on the classification performance.
%R 10.18653/v1/W17-5013
%U https://aclanthology.org/W17-5013
%U https://doi.org/10.18653/v1/W17-5013
%P 121-132
Markdown (Informal)
[Combining Multiple Corpora for Readability Assessment for People with Cognitive Disabilities](https://aclanthology.org/W17-5013) (Yaneva et al., BEA 2017)
ACL