@inproceedings{ousidhoum-etal-2021-probing,
title = "Probing Toxic Content in Large Pre-Trained Language Models",
author = "Ousidhoum, Nedjma and
Zhao, Xinran and
Fang, Tianqing and
Song, Yangqiu and
Yeung, Dit-Yan",
editor = "Zong, Chengqing and
Xia, Fei and
Li, Wenjie and
Navigli, Roberto",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.acl-long.329",
doi = "10.18653/v1/2021.acl-long.329",
pages = "4262--4274",
abstract = "Large pre-trained language models (PTLMs) have been shown to carry biases towards different social groups which leads to the reproduction of stereotypical and toxic content by major NLP systems. We propose a method based on logistic regression classifiers to probe English, French, and Arabic PTLMs and quantify the potentially harmful content that they convey with respect to a set of templates. The templates are prompted by a name of a social group followed by a cause-effect relation. We use PTLMs to predict masked tokens at the end of a sentence in order to examine how likely they enable toxicity towards specific communities. We shed the light on how such negative content can be triggered within unrelated and benign contexts based on evidence from a large-scale study, then we explain how to take advantage of our methodology to assess and mitigate the toxicity transmitted by PTLMs.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ousidhoum-etal-2021-probing">
<titleInfo>
<title>Probing Toxic Content in Large Pre-Trained Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nedjma</namePart>
<namePart type="family">Ousidhoum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinran</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianqing</namePart>
<namePart type="family">Fang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yangqiu</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dit-Yan</namePart>
<namePart type="family">Yeung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenjie</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Navigli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large pre-trained language models (PTLMs) have been shown to carry biases towards different social groups which leads to the reproduction of stereotypical and toxic content by major NLP systems. We propose a method based on logistic regression classifiers to probe English, French, and Arabic PTLMs and quantify the potentially harmful content that they convey with respect to a set of templates. The templates are prompted by a name of a social group followed by a cause-effect relation. We use PTLMs to predict masked tokens at the end of a sentence in order to examine how likely they enable toxicity towards specific communities. We shed the light on how such negative content can be triggered within unrelated and benign contexts based on evidence from a large-scale study, then we explain how to take advantage of our methodology to assess and mitigate the toxicity transmitted by PTLMs.</abstract>
<identifier type="citekey">ousidhoum-etal-2021-probing</identifier>
<identifier type="doi">10.18653/v1/2021.acl-long.329</identifier>
<location>
<url>https://aclanthology.org/2021.acl-long.329</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>4262</start>
<end>4274</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Probing Toxic Content in Large Pre-Trained Language Models
%A Ousidhoum, Nedjma
%A Zhao, Xinran
%A Fang, Tianqing
%A Song, Yangqiu
%A Yeung, Dit-Yan
%Y Zong, Chengqing
%Y Xia, Fei
%Y Li, Wenjie
%Y Navigli, Roberto
%S Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F ousidhoum-etal-2021-probing
%X Large pre-trained language models (PTLMs) have been shown to carry biases towards different social groups which leads to the reproduction of stereotypical and toxic content by major NLP systems. We propose a method based on logistic regression classifiers to probe English, French, and Arabic PTLMs and quantify the potentially harmful content that they convey with respect to a set of templates. The templates are prompted by a name of a social group followed by a cause-effect relation. We use PTLMs to predict masked tokens at the end of a sentence in order to examine how likely they enable toxicity towards specific communities. We shed the light on how such negative content can be triggered within unrelated and benign contexts based on evidence from a large-scale study, then we explain how to take advantage of our methodology to assess and mitigate the toxicity transmitted by PTLMs.
%R 10.18653/v1/2021.acl-long.329
%U https://aclanthology.org/2021.acl-long.329
%U https://doi.org/10.18653/v1/2021.acl-long.329
%P 4262-4274
Markdown (Informal)
[Probing Toxic Content in Large Pre-Trained Language Models](https://aclanthology.org/2021.acl-long.329) (Ousidhoum et al., ACL-IJCNLP 2021)
ACL
- Nedjma Ousidhoum, Xinran Zhao, Tianqing Fang, Yangqiu Song, and Dit-Yan Yeung. 2021. Probing Toxic Content in Large Pre-Trained Language Models. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pages 4262–4274, Online. Association for Computational Linguistics.