@inproceedings{ehara-2021-evaluation,
title = "Evaluation of Unsupervised Automatic Readability Assessors Using Rank Correlations",
author = "Ehara, Yo",
editor = "Gao, Yang and
Eger, Steffen and
Zhao, Wei and
Lertvittayakumjorn, Piyawat and
Fomicheva, Marina",
booktitle = "Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.eval4nlp-1.7",
doi = "10.18653/v1/2021.eval4nlp-1.7",
pages = "62--72",
abstract = "Automatic readability assessment (ARA) is the task of automatically assessing readability with little or no human supervision. ARA is essential for many second language acquisition applications to reduce the workload of annotators, who are usually language teachers. Previous unsupervised approaches manually searched textual features that correlated well with readability labels, such as perplexity scores of large language models. This paper argues that, to evaluate an assessors{'} performance, rank-correlation coefficients should be used instead of Pearson{'}s correlation coefficient ($\rho$). In the experiments, we show that its performance can be easily underestimated using Pearson{'}s $\rho$, which is significantly affected by the linearity of the output readability scores. We also propose a lightweight unsupervised readability assessor that achieved the best performance in both the rank correlations and Pearson{'}s $\rho$ among all unsupervised assessors compared.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ehara-2021-evaluation">
<titleInfo>
<title>Evaluation of Unsupervised Automatic Readability Assessors Using Rank Correlations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yo</namePart>
<namePart type="family">Ehara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steffen</namePart>
<namePart type="family">Eger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Piyawat</namePart>
<namePart type="family">Lertvittayakumjorn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marina</namePart>
<namePart type="family">Fomicheva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatic readability assessment (ARA) is the task of automatically assessing readability with little or no human supervision. ARA is essential for many second language acquisition applications to reduce the workload of annotators, who are usually language teachers. Previous unsupervised approaches manually searched textual features that correlated well with readability labels, such as perplexity scores of large language models. This paper argues that, to evaluate an assessor’s performance, rank-correlation coefficients should be used instead of Pearson’s correlation coefficient (ρ). In the experiments, we show that its performance can be easily underestimated using Pearson’s ρ, which is significantly affected by the linearity of the output readability scores. We also propose a lightweight unsupervised readability assessor that achieved the best performance in both the rank correlations and Pearson’s ρ among all unsupervised assessors compared.</abstract>
<identifier type="citekey">ehara-2021-evaluation</identifier>
<identifier type="doi">10.18653/v1/2021.eval4nlp-1.7</identifier>
<location>
<url>https://aclanthology.org/2021.eval4nlp-1.7</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>62</start>
<end>72</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluation of Unsupervised Automatic Readability Assessors Using Rank Correlations
%A Ehara, Yo
%Y Gao, Yang
%Y Eger, Steffen
%Y Zhao, Wei
%Y Lertvittayakumjorn, Piyawat
%Y Fomicheva, Marina
%S Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F ehara-2021-evaluation
%X Automatic readability assessment (ARA) is the task of automatically assessing readability with little or no human supervision. ARA is essential for many second language acquisition applications to reduce the workload of annotators, who are usually language teachers. Previous unsupervised approaches manually searched textual features that correlated well with readability labels, such as perplexity scores of large language models. This paper argues that, to evaluate an assessor’s performance, rank-correlation coefficients should be used instead of Pearson’s correlation coefficient (ρ). In the experiments, we show that its performance can be easily underestimated using Pearson’s ρ, which is significantly affected by the linearity of the output readability scores. We also propose a lightweight unsupervised readability assessor that achieved the best performance in both the rank correlations and Pearson’s ρ among all unsupervised assessors compared.
%R 10.18653/v1/2021.eval4nlp-1.7
%U https://aclanthology.org/2021.eval4nlp-1.7
%U https://doi.org/10.18653/v1/2021.eval4nlp-1.7
%P 62-72
Markdown (Informal)
[Evaluation of Unsupervised Automatic Readability Assessors Using Rank Correlations](https://aclanthology.org/2021.eval4nlp-1.7) (Ehara, Eval4NLP 2021)
ACL
Yo Ehara. 2021. Evaluation of Unsupervised Automatic Readability Assessors Using Rank Correlations. In Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems, pages 62–72, Punta Cana, Dominican Republic. Association for Computational Linguistics.
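
As an informal illustration of the abstract's central point, the sketch below (not taken from the paper; the scores are made-up values and the use of SciPy is an assumption) compares Pearson's and Spearman's coefficients for a hypothetical assessor whose scores rank texts perfectly but grow nonlinearly. Pearson's coefficient falls well below 1 because of the nonlinearity, while the rank correlation remains 1.0, which is the sense in which Pearson's ρ can underestimate an assessor.

```python
# Minimal sketch (illustrative only, not the paper's method or data):
# a monotone but nonlinear scorer gets full credit from a rank correlation
# while Pearson's coefficient penalizes the nonlinearity.
import numpy as np
from scipy.stats import pearsonr, spearmanr

# Hypothetical gold readability levels for ten texts (ordinal grades).
gold = np.arange(1, 11, dtype=float)

# Hypothetical assessor scores: same ordering as gold, but exponential growth
# (e.g., perplexity-like values), so the relationship is monotone, not linear.
scores = np.exp(0.8 * gold)

print("Pearson's coefficient: %.3f" % pearsonr(gold, scores)[0])   # well below 1
print("Spearman's rank corr.: %.3f" % spearmanr(gold, scores)[0])  # exactly 1.0
```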