@inproceedings{iskender-etal-2020-best,
title = "Best Practices for Crowd-based Evaluation of {G}erman Summarization: Comparing Crowd, Expert and Automatic Evaluation",
author = {Iskender, Neslihan and
Polzehl, Tim and
M{\"o}ller, Sebastian},
editor = "Eger, Steffen and
Gao, Yang and
Peyrard, Maxime and
Zhao, Wei and
Hovy, Eduard",
booktitle = "Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.eval4nlp-1.16",
doi = "10.18653/v1/2020.eval4nlp-1.16",
pages = "164--175",
abstract = "One of the main challenges in the development of summarization tools is summarization quality evaluation. On the one hand, the human assessment of summarization quality conducted by linguistic experts is slow, expensive, and still not a standardized procedure. On the other hand, the automatic assessment metrics are reported not to correlate high enough with human quality ratings. As a solution, we propose crowdsourcing as a fast, scalable, and cost-effective alternative to expert evaluations to assess the intrinsic and extrinsic quality of summarization by comparing crowd ratings with expert ratings and automatic metrics such as ROUGE, BLEU, or BertScore on a German summarization data set. Our results provide a basis for best practices for crowd-based summarization evaluation regarding major influential factors such as the best annotation aggregation method, the influence of readability and reading effort on summarization evaluation, and the optimal number of crowd workers to achieve comparable results to experts, especially when determining factors such as overall quality, grammaticality, referential clarity, focus, structure {\&} coherence, summary usefulness, and summary informativeness.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="iskender-etal-2020-best">
<titleInfo>
<title>Best Practices for Crowd-based Evaluation of German Summarization: Comparing Crowd, Expert and Automatic Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Neslihan</namePart>
<namePart type="family">Iskender</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tim</namePart>
<namePart type="family">Polzehl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Möller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Steffen</namePart>
<namePart type="family">Eger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maxime</namePart>
<namePart type="family">Peyrard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eduard</namePart>
<namePart type="family">Hovy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>One of the main challenges in the development of summarization tools is summarization quality evaluation. On the one hand, the human assessment of summarization quality conducted by linguistic experts is slow, expensive, and still not a standardized procedure. On the other hand, the automatic assessment metrics are reported not to correlate high enough with human quality ratings. As a solution, we propose crowdsourcing as a fast, scalable, and cost-effective alternative to expert evaluations to assess the intrinsic and extrinsic quality of summarization by comparing crowd ratings with expert ratings and automatic metrics such as ROUGE, BLEU, or BertScore on a German summarization data set. Our results provide a basis for best practices for crowd-based summarization evaluation regarding major influential factors such as the best annotation aggregation method, the influence of readability and reading effort on summarization evaluation, and the optimal number of crowd workers to achieve comparable results to experts, especially when determining factors such as overall quality, grammaticality, referential clarity, focus, structure & coherence, summary usefulness, and summary informativeness.</abstract>
<identifier type="citekey">iskender-etal-2020-best</identifier>
<identifier type="doi">10.18653/v1/2020.eval4nlp-1.16</identifier>
<location>
<url>https://aclanthology.org/2020.eval4nlp-1.16</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>164</start>
<end>175</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Best Practices for Crowd-based Evaluation of German Summarization: Comparing Crowd, Expert and Automatic Evaluation
%A Iskender, Neslihan
%A Polzehl, Tim
%A Möller, Sebastian
%Y Eger, Steffen
%Y Gao, Yang
%Y Peyrard, Maxime
%Y Zhao, Wei
%Y Hovy, Eduard
%S Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F iskender-etal-2020-best
%X One of the main challenges in the development of summarization tools is summarization quality evaluation. On the one hand, the human assessment of summarization quality conducted by linguistic experts is slow, expensive, and still not a standardized procedure. On the other hand, the automatic assessment metrics are reported not to correlate high enough with human quality ratings. As a solution, we propose crowdsourcing as a fast, scalable, and cost-effective alternative to expert evaluations to assess the intrinsic and extrinsic quality of summarization by comparing crowd ratings with expert ratings and automatic metrics such as ROUGE, BLEU, or BertScore on a German summarization data set. Our results provide a basis for best practices for crowd-based summarization evaluation regarding major influential factors such as the best annotation aggregation method, the influence of readability and reading effort on summarization evaluation, and the optimal number of crowd workers to achieve comparable results to experts, especially when determining factors such as overall quality, grammaticality, referential clarity, focus, structure & coherence, summary usefulness, and summary informativeness.
%R 10.18653/v1/2020.eval4nlp-1.16
%U https://aclanthology.org/2020.eval4nlp-1.16
%U https://doi.org/10.18653/v1/2020.eval4nlp-1.16
%P 164-175
Markdown (Informal)
[Best Practices for Crowd-based Evaluation of German Summarization: Comparing Crowd, Expert and Automatic Evaluation](https://aclanthology.org/2020.eval4nlp-1.16) (Iskender et al., Eval4NLP 2020)
ACL