@inproceedings{wagner-filho-etal-2016-automatic,
title = "Automatic Construction of Large Readability Corpora",
author = "Wagner Filho, Jorge Alberto and
Wilkens, Rodrigo and
Villavicencio, Aline",
editor = "Brunato, Dominique and
Dell{'}Orletta, Felice and
Venturi, Giulia and
Fran{\c{c}}ois, Thomas and
Blache, Philippe",
booktitle = "Proceedings of the Workshop on Computational Linguistics for Linguistic Complexity ({CL}4{LC})",
month = dec,
year = "2016",
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://aclanthology.org/W16-4119",
pages = "164--173",
abstract = "This work presents a framework for the automatic construction of large Web corpora classified by readability level. We compare different Machine Learning classifiers for the task of readability assessment focusing on Portuguese and English texts, analysing the impact of variables like the feature inventory used in the resulting corpus. In a comparison between shallow and deeper features, the former already produce F-measures of over 0.75 for Portuguese texts, but the use of additional features results in even better results, in most cases. For English, shallow features also perform well as do classic readability formulas. Comparing different classifiers for the task, logistic regression obtained, in general, the best results, but with considerable differences between the results for two and those for three-classes, especially regarding the intermediary class. Given the large scale of the resulting corpus, for evaluation we adopt the agreement between different classifiers as an indication of readability assessment certainty. As a result of this work, a large corpus for Brazilian Portuguese was built, including 1.7 million documents and about 1.6 billion tokens, already parsed and annotated with 134 different textual attributes, along with the agreement among the various classifiers.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wagner-filho-etal-2016-automatic">
<titleInfo>
<title>Automatic Construction of Large Readability Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jorge</namePart>
<namePart type="given">Alberto</namePart>
<namePart type="family">Wagner Filho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rodrigo</namePart>
<namePart type="family">Wilkens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Villavicencio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Computational Linguistics for Linguistic Complexity (CL4LC)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dominique</namePart>
<namePart type="family">Brunato</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felice</namePart>
<namePart type="family">Dell’Orletta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giulia</namePart>
<namePart type="family">Venturi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">François</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The COLING 2016 Organizing Committee</publisher>
<place>
<placeTerm type="text">Osaka, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This work presents a framework for the automatic construction of large Web corpora classified by readability level. We compare different Machine Learning classifiers for the task of readability assessment focusing on Portuguese and English texts, analysing the impact of variables like the feature inventory used in the resulting corpus. In a comparison between shallow and deeper features, the former already produce F-measures of over 0.75 for Portuguese texts, but the use of additional features results in even better results, in most cases. For English, shallow features also perform well as do classic readability formulas. Comparing different classifiers for the task, logistic regression obtained, in general, the best results, but with considerable differences between the results for two and those for three-classes, especially regarding the intermediary class. Given the large scale of the resulting corpus, for evaluation we adopt the agreement between different classifiers as an indication of readability assessment certainty. As a result of this work, a large corpus for Brazilian Portuguese was built, including 1.7 million documents and about 1.6 billion tokens, already parsed and annotated with 134 different textual attributes, along with the agreement among the various classifiers.</abstract>
<identifier type="citekey">wagner-filho-etal-2016-automatic</identifier>
<location>
<url>https://aclanthology.org/W16-4119</url>
</location>
<part>
<date>2016-12</date>
<extent unit="page">
<start>164</start>
<end>173</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Construction of Large Readability Corpora
%A Wagner Filho, Jorge Alberto
%A Wilkens, Rodrigo
%A Villavicencio, Aline
%Y Brunato, Dominique
%Y Dell’Orletta, Felice
%Y Venturi, Giulia
%Y François, Thomas
%Y Blache, Philippe
%S Proceedings of the Workshop on Computational Linguistics for Linguistic Complexity (CL4LC)
%D 2016
%8 December
%I The COLING 2016 Organizing Committee
%C Osaka, Japan
%F wagner-filho-etal-2016-automatic
%X This work presents a framework for the automatic construction of large Web corpora classified by readability level. We compare different Machine Learning classifiers for the task of readability assessment focusing on Portuguese and English texts, analysing the impact of variables like the feature inventory used in the resulting corpus. In a comparison between shallow and deeper features, the former already produce F-measures of over 0.75 for Portuguese texts, but the use of additional features results in even better results, in most cases. For English, shallow features also perform well as do classic readability formulas. Comparing different classifiers for the task, logistic regression obtained, in general, the best results, but with considerable differences between the results for two and those for three-classes, especially regarding the intermediary class. Given the large scale of the resulting corpus, for evaluation we adopt the agreement between different classifiers as an indication of readability assessment certainty. As a result of this work, a large corpus for Brazilian Portuguese was built, including 1.7 million documents and about 1.6 billion tokens, already parsed and annotated with 134 different textual attributes, along with the agreement among the various classifiers.
%U https://aclanthology.org/W16-4119
%P 164-173
Markdown (Informal)
[Automatic Construction of Large Readability Corpora](https://aclanthology.org/W16-4119) (Wagner Filho et al., CL4LC 2016)
ACL
- Jorge Alberto Wagner Filho, Rodrigo Wilkens, and Aline Villavicencio. 2016. Automatic Construction of Large Readability Corpora. In Proceedings of the Workshop on Computational Linguistics for Linguistic Complexity (CL4LC), pages 164–173, Osaka, Japan. The COLING 2016 Organizing Committee.