@inproceedings{blodgett-etal-2021-stereotyping,
    title = "Stereotyping {N}orwegian Salmon: An Inventory of Pitfalls in Fairness Benchmark Datasets",
    author = "Blodgett, Su Lin and
      Lopez, Gilsinia and
      Olteanu, Alexandra and
      Sim, Robert and
      Wallach, Hanna",
    editor = "Zong, Chengqing and
      Xia, Fei and
      Li, Wenjie and
      Navigli, Roberto",
    booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
    month = aug,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.acl-long.81",
    doi = "10.18653/v1/2021.acl-long.81",
    pages = "1004--1015",
    abstract = "Auditing NLP systems for computational harms like surfacing stereotypes is an elusive goal. Several recent efforts have focused on benchmark datasets consisting of pairs of contrastive sentences, which are often accompanied by metrics that aggregate an NLP system{'}s behavior on these pairs into measurements of harms. We examine four such benchmarks constructed for two NLP tasks: language modeling and coreference resolution. We apply a measurement modeling lens{---}originating from the social sciences{---}to inventory a range of pitfalls that threaten these benchmarks{'} validity as measurement models for stereotyping. We find that these benchmarks frequently lack clear articulations of what is being measured, and we highlight a range of ambiguities and unstated assumptions that affect how these benchmarks conceptualize and operationalize stereotyping.",
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="blodgett-etal-2021-stereotyping">
    <titleInfo>
      <title>Stereotyping Norwegian Salmon: An Inventory of Pitfalls in Fairness Benchmark Datasets</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Su</namePart>
      <namePart type="given">Lin</namePart>
      <namePart type="family">Blodgett</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gilsinia</namePart>
      <namePart type="family">Lopez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alexandra</namePart>
      <namePart type="family">Olteanu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Robert</namePart>
      <namePart type="family">Sim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hanna</namePart>
      <namePart type="family">Wallach</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Chengqing</namePart>
        <namePart type="family">Zong</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fei</namePart>
        <namePart type="family">Xia</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Wenjie</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Roberto</namePart>
        <namePart type="family">Navigli</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Auditing NLP systems for computational harms like surfacing stereotypes is an elusive goal. Several recent efforts have focused on benchmark datasets consisting of pairs of contrastive sentences, which are often accompanied by metrics that aggregate an NLP system’s behavior on these pairs into measurements of harms. We examine four such benchmarks constructed for two NLP tasks: language modeling and coreference resolution. We apply a measurement modeling lens—originating from the social sciences—to inventory a range of pitfalls that threaten these benchmarks’ validity as measurement models for stereotyping. We find that these benchmarks frequently lack clear articulations of what is being measured, and we highlight a range of ambiguities and unstated assumptions that affect how these benchmarks conceptualize and operationalize stereotyping.</abstract>
    <identifier type="citekey">blodgett-etal-2021-stereotyping</identifier>
    <identifier type="doi">10.18653/v1/2021.acl-long.81</identifier>
    <location>
      <url>https://aclanthology.org/2021.acl-long.81</url>
    </location>
    <part>
      <date>2021-08</date>
      <extent unit="page">
        <start>1004</start>
        <end>1015</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Stereotyping Norwegian Salmon: An Inventory of Pitfalls in Fairness Benchmark Datasets
%A Blodgett, Su Lin
%A Lopez, Gilsinia
%A Olteanu, Alexandra
%A Sim, Robert
%A Wallach, Hanna
%Y Zong, Chengqing
%Y Xia, Fei
%Y Li, Wenjie
%Y Navigli, Roberto
%S Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F blodgett-etal-2021-stereotyping
%X Auditing NLP systems for computational harms like surfacing stereotypes is an elusive goal. Several recent efforts have focused on benchmark datasets consisting of pairs of contrastive sentences, which are often accompanied by metrics that aggregate an NLP system’s behavior on these pairs into measurements of harms. We examine four such benchmarks constructed for two NLP tasks: language modeling and coreference resolution. We apply a measurement modeling lens—originating from the social sciences—to inventory a range of pitfalls that threaten these benchmarks’ validity as measurement models for stereotyping. We find that these benchmarks frequently lack clear articulations of what is being measured, and we highlight a range of ambiguities and unstated assumptions that affect how these benchmarks conceptualize and operationalize stereotyping.
%R 10.18653/v1/2021.acl-long.81
%U https://aclanthology.org/2021.acl-long.81
%U https://doi.org/10.18653/v1/2021.acl-long.81
%P 1004-1015

Markdown (Informal)
[Stereotyping Norwegian Salmon: An Inventory of Pitfalls in Fairness Benchmark Datasets](https://aclanthology.org/2021.acl-long.81) (Blodgett et al., ACL-IJCNLP 2021)

ACL
Su Lin Blodgett, Gilsinia Lopez, Alexandra Olteanu, Robert Sim, and Hanna Wallach. 2021. Stereotyping Norwegian Salmon: An Inventory of Pitfalls in Fairness Benchmark Datasets. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pages 1004–1015, Online. Association for Computational Linguistics.