@inproceedings{jorgensen-sogaard-2021-evaluation,
title = "Evaluation of Summarization Systems across Gender, Age, and Race",
author = "J{\o}rgensen, Anna and
S{\o}gaard, Anders",
editor = "Carenini, Giuseppe and
Cheung, Jackie Chi Kit and
Dong, Yue and
Liu, Fei and
Wang, Lu",
booktitle = "Proceedings of the Third Workshop on New Frontiers in Summarization",
month = nov,
year = "2021",
address = "Online and in Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.newsum-1.6",
doi = "10.18653/v1/2021.newsum-1.6",
pages = "51--56",
abstract = "Summarization systems are ultimately evaluated by human annotators and raters. Usually, annotators and raters do not reflect the demographics of end users, but are recruited through student populations or crowdsourcing platforms with skewed demographics. For two different evaluation scenarios {--} evaluation against gold summaries and system output ratings {--} we show that summary evaluation is sensitive to protected attributes. This can severely bias system development and evaluation, leading us to build models that cater for some groups rather than others.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jorgensen-sogaard-2021-evaluation">
  <titleInfo>
    <title>Evaluation of Summarization Systems across Gender, Age, and Race</title>
  </titleInfo>
  <name type="personal">
    <namePart type="given">Anna</namePart>
    <namePart type="family">Jørgensen</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Anders</namePart>
    <namePart type="family">Søgaard</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <originInfo>
    <dateIssued>2021-11</dateIssued>
  </originInfo>
  <typeOfResource>text</typeOfResource>
  <relatedItem type="host">
    <titleInfo>
      <title>Proceedings of the Third Workshop on New Frontiers in Summarization</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Giuseppe</namePart>
      <namePart type="family">Carenini</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jackie</namePart>
      <namePart type="given">Chi</namePart>
      <namePart type="given">Kit</namePart>
      <namePart type="family">Cheung</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yue</namePart>
      <namePart type="family">Dong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fei</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lu</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <originInfo>
      <publisher>Association for Computational Linguistics</publisher>
      <place>
        <placeTerm type="text">Online and in Dominican Republic</placeTerm>
      </place>
    </originInfo>
    <genre authority="marcgt">conference publication</genre>
  </relatedItem>
  <abstract>Summarization systems are ultimately evaluated by human annotators and raters. Usually, annotators and raters do not reflect the demographics of end users, but are recruited through student populations or crowdsourcing platforms with skewed demographics. For two different evaluation scenarios – evaluation against gold summaries and system output ratings – we show that summary evaluation is sensitive to protected attributes. This can severely bias system development and evaluation, leading us to build models that cater for some groups rather than others.</abstract>
  <identifier type="citekey">jorgensen-sogaard-2021-evaluation</identifier>
  <identifier type="doi">10.18653/v1/2021.newsum-1.6</identifier>
  <location>
    <url>https://aclanthology.org/2021.newsum-1.6</url>
  </location>
  <part>
    <date>2021-11</date>
    <extent unit="page">
      <start>51</start>
      <end>56</end>
    </extent>
  </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluation of Summarization Systems across Gender, Age, and Race
%A Jørgensen, Anna
%A Søgaard, Anders
%Y Carenini, Giuseppe
%Y Cheung, Jackie Chi Kit
%Y Dong, Yue
%Y Liu, Fei
%Y Wang, Lu
%S Proceedings of the Third Workshop on New Frontiers in Summarization
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online and in Dominican Republic
%F jorgensen-sogaard-2021-evaluation
%X Summarization systems are ultimately evaluated by human annotators and raters. Usually, annotators and raters do not reflect the demographics of end users, but are recruited through student populations or crowdsourcing platforms with skewed demographics. For two different evaluation scenarios – evaluation against gold summaries and system output ratings – we show that summary evaluation is sensitive to protected attributes. This can severely bias system development and evaluation, leading us to build models that cater for some groups rather than others.
%R 10.18653/v1/2021.newsum-1.6
%U https://aclanthology.org/2021.newsum-1.6
%U https://doi.org/10.18653/v1/2021.newsum-1.6
%P 51-56
Markdown (Informal)
[Evaluation of Summarization Systems across Gender, Age, and Race](https://aclanthology.org/2021.newsum-1.6) (Jørgensen & Søgaard, NewSum 2021)
ACL
Anna Jørgensen and Anders Søgaard. 2021. [Evaluation of Summarization Systems across Gender, Age, and Race](https://aclanthology.org/2021.newsum-1.6). In *Proceedings of the Third Workshop on New Frontiers in Summarization*, pages 51–56, Online and in Dominican Republic. Association for Computational Linguistics.