@inproceedings{shukla-etal-2025-silencing,
title = "Silencing Empowerment, Allowing Bigotry: Auditing the Moderation of Hate Speech on Twitch",
author = "Shukla, Prarabdh and
Chong, Wei Yin and
Patel, Yash and
Schaffner, Brennan and
Pruthi, Danish and
Bhagoji, Arjun",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1110/",
doi = "10.18653/v1/2025.acl-long.1110",
pages = "22771--22797",
ISBN = "979-8-89176-251-0",
abstract = "To meet the demands of content moderation, online platforms have resorted to automated systems. Newer forms of real-time engagement ($\textit{e.g.}$, users commenting on live streams) on platforms like Twitch exert additional pressures on the latency expected of such moderation systems. Despite their prevalence, relatively little is known about the effectiveness of these systems. In this paper, we conduct an audit of Twitch{'}s automated moderation tool ($\texttt{AutoMod}$) to investigate its effectiveness in flagging hateful content. For our audit, we create streaming accounts to act as siloed test beds, and interface with the live chat using Twitch{'}s APIs to send over 107,000 comments collated from 4 datasets. We measure $\texttt{AutoMod}${'}s accuracy in flagging blatantly hateful content containing misogyny, racism, ableism and homophobia. Our experiments reveal that a large fraction of hateful messages, up to 94{\%} on some datasets, $\textit{bypass moderation}$. Contextual addition of slurs to these messages results in 100{\%} removal, revealing $\texttt{AutoMod}${'}s reliance on slurs as a hate signal. We also find that contrary to Twitch{'}s community guidelines, $\texttt{AutoMod}$ blocks up to 89.5{\%} of benign examples that use sensitive words in pedagogical or empowering contexts. Overall, our audit points to large gaps in $\texttt{AutoMod}${'}s capabilities and underscores the importance for such systems to understand context effectively."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shukla-etal-2025-silencing">
<titleInfo>
<title>Silencing Empowerment, Allowing Bigotry: Auditing the Moderation of Hate Speech on Twitch</title>
</titleInfo>
<name type="personal">
<namePart type="given">Prarabdh</namePart>
<namePart type="family">Shukla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="given">Yin</namePart>
<namePart type="family">Chong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yash</namePart>
<namePart type="family">Patel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brennan</namePart>
<namePart type="family">Schaffner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danish</namePart>
<namePart type="family">Pruthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arjun</namePart>
<namePart type="family">Bhagoji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>To meet the demands of content moderation, online platforms have resorted to automated systems. Newer forms of real-time engagement (e.g., users commenting on live streams) on platforms like Twitch exert additional pressures on the latency expected of such moderation systems. Despite their prevalence, relatively little is known about the effectiveness of these systems. In this paper, we conduct an audit of Twitch’s automated moderation tool (AutoMod) to investigate its effectiveness in flagging hateful content. For our audit, we create streaming accounts to act as siloed test beds, and interface with the live chat using Twitch’s APIs to send over 107,000 comments collated from 4 datasets. We measure AutoMod’s accuracy in flagging blatantly hateful content containing misogyny, racism, ableism and homophobia. Our experiments reveal that a large fraction of hateful messages, up to 94% on some datasets, bypass moderation. Contextual addition of slurs to these messages results in 100% removal, revealing AutoMod’s reliance on slurs as a hate signal. We also find that contrary to Twitch’s community guidelines, AutoMod blocks up to 89.5% of benign examples that use sensitive words in pedagogical or empowering contexts. Overall, our audit points to large gaps in AutoMod’s capabilities and underscores the importance for such systems to understand context effectively.</abstract>
<identifier type="citekey">shukla-etal-2025-silencing</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1110</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1110/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>22771</start>
<end>22797</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Silencing Empowerment, Allowing Bigotry: Auditing the Moderation of Hate Speech on Twitch
%A Shukla, Prarabdh
%A Chong, Wei Yin
%A Patel, Yash
%A Schaffner, Brennan
%A Pruthi, Danish
%A Bhagoji, Arjun
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F shukla-etal-2025-silencing
%X To meet the demands of content moderation, online platforms have resorted to automated systems. Newer forms of real-time engagement (e.g., users commenting on live streams) on platforms like Twitch exert additional pressures on the latency expected of such moderation systems. Despite their prevalence, relatively little is known about the effectiveness of these systems. In this paper, we conduct an audit of Twitch’s automated moderation tool (AutoMod) to investigate its effectiveness in flagging hateful content. For our audit, we create streaming accounts to act as siloed test beds, and interface with the live chat using Twitch’s APIs to send over 107,000 comments collated from 4 datasets. We measure AutoMod’s accuracy in flagging blatantly hateful content containing misogyny, racism, ableism and homophobia. Our experiments reveal that a large fraction of hateful messages, up to 94% on some datasets, bypass moderation. Contextual addition of slurs to these messages results in 100% removal, revealing AutoMod’s reliance on slurs as a hate signal. We also find that contrary to Twitch’s community guidelines, AutoMod blocks up to 89.5% of benign examples that use sensitive words in pedagogical or empowering contexts. Overall, our audit points to large gaps in AutoMod’s capabilities and underscores the importance for such systems to understand context effectively.
%R 10.18653/v1/2025.acl-long.1110
%U https://aclanthology.org/2025.acl-long.1110/
%U https://doi.org/10.18653/v1/2025.acl-long.1110
%P 22771-22797
Markdown (Informal)
[Silencing Empowerment, Allowing Bigotry: Auditing the Moderation of Hate Speech on Twitch](https://aclanthology.org/2025.acl-long.1110/) (Shukla et al., ACL 2025)
ACL
Prarabdh Shukla, Wei Yin Chong, Yash Patel, Brennan Schaffner, Danish Pruthi, and Arjun Bhagoji. 2025. Silencing Empowerment, Allowing Bigotry: Auditing the Moderation of Hate Speech on Twitch. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 22771–22797, Vienna, Austria. Association for Computational Linguistics.