@inproceedings{pozzobon-etal-2023-challenges,
title = "On the Challenges of Using Black-Box {API}s for Toxicity Evaluation in Research",
author = "Pozzobon, Luiza and
Ermis, Beyza and
Lewis, Patrick and
Hooker, Sara",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.472",
doi = "10.18653/v1/2023.emnlp-main.472",
pages = "7595--7609",
abstract = "Perception of toxicity evolves over time and often differs between geographies and cultural backgrounds. Similarly, black-box commercially available APIs for detecting toxicity, such as the Perspective API, are not static, but frequently retrained to address any unattended weaknesses and biases. We evaluate the implications of these changes on the reproducibility of findings that compare the relative merits of models and methods that aim to curb toxicity. Our findings suggest that research that relied on inherited automatic toxicity scores to compare models and techniques may have resulted in inaccurate findings. Rescoring all models from HELM, a widely respected living benchmark, for toxicity with the recent version of the API led to a different ranking of widely used foundation models. We suggest caution in applying apples-to-apples comparisons between studies and call for a more structured approach to evaluating toxicity over time.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pozzobon-etal-2023-challenges">
<titleInfo>
<title>On the Challenges of Using Black-Box APIs for Toxicity Evaluation in Research</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luiza</namePart>
<namePart type="family">Pozzobon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Beyza</namePart>
<namePart type="family">Ermis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Lewis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Hooker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Perception of toxicity evolves over time and often differs between geographies and cultural backgrounds. Similarly, black-box commercially available APIs for detecting toxicity, such as the Perspective API, are not static, but frequently retrained to address any unattended weaknesses and biases. We evaluate the implications of these changes on the reproducibility of findings that compare the relative merits of models and methods that aim to curb toxicity. Our findings suggest that research that relied on inherited automatic toxicity scores to compare models and techniques may have resulted in inaccurate findings. Rescoring all models from HELM, a widely respected living benchmark, for toxicity with the recent version of the API led to a different ranking of widely used foundation models. We suggest caution in applying apples-to-apples comparisons between studies and call for a more structured approach to evaluating toxicity over time.</abstract>
<identifier type="citekey">pozzobon-etal-2023-challenges</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.472</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.472</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>7595</start>
<end>7609</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Challenges of Using Black-Box APIs for Toxicity Evaluation in Research
%A Pozzobon, Luiza
%A Ermis, Beyza
%A Lewis, Patrick
%A Hooker, Sara
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F pozzobon-etal-2023-challenges
%X Perception of toxicity evolves over time and often differs between geographies and cultural backgrounds. Similarly, black-box commercially available APIs for detecting toxicity, such as the Perspective API, are not static, but frequently retrained to address any unattended weaknesses and biases. We evaluate the implications of these changes on the reproducibility of findings that compare the relative merits of models and methods that aim to curb toxicity. Our findings suggest that research that relied on inherited automatic toxicity scores to compare models and techniques may have resulted in inaccurate findings. Rescoring all models from HELM, a widely respected living benchmark, for toxicity with the recent version of the API led to a different ranking of widely used foundation models. We suggest caution in applying apples-to-apples comparisons between studies and call for a more structured approach to evaluating toxicity over time.
%R 10.18653/v1/2023.emnlp-main.472
%U https://aclanthology.org/2023.emnlp-main.472
%U https://doi.org/10.18653/v1/2023.emnlp-main.472
%P 7595-7609
Markdown (Informal)
[On the Challenges of Using Black-Box APIs for Toxicity Evaluation in Research](https://aclanthology.org/2023.emnlp-main.472) (Pozzobon et al., EMNLP 2023)
ACL