@inproceedings{pachinger-etal-2025-disaggregated,
title = "A Disaggregated Dataset on {E}nglish Offensiveness Containing Spans",
author = "Pachinger, Pia and
Goldzycher, Janis and
Planitzer, Anna M. and
Neidhardt, Julia and
Hanbury, Allan",
editor = "Abercrombie, Gavin and
Basile, Valerio and
Frenda, Simona and
Tonelli, Sara and
Dudy, Shiran",
booktitle = "Proceedings of the The 4th Workshop on Perspectivist Approaches to NLP",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.nlperspectives-1.1/",
pages = "1--14",
ISBN = "979-8-89176-350-0",
abstract = "Toxicity labels at sub-document granularity and disaggregated labels lead to more nuanced and personalized toxicity classification and facilitate analysis. We re-annotate a subset of 1983 posts of the Jigsaw Toxic Comment Classification Challenge and provide disaggregated toxicity labels and spans that identify inappropriate language and targets of toxic statements. Manual analysis shows that five annotations per instance effectively capture meaningful disagreement patterns and allow for finer distinctions between genuine disagreement and that arising from annotation error or inconsistency. Our main findings are: (1) Disagreement often stems from divergent interpretations of edge-case toxicity (2) Disagreement is especially high in cases of toxic statements involving non-human targets (3) Disagreement on whether a passage consists of inappropriate language occurs not only on inherently questionable terms, but also on words that may be inappropriate in specific contexts while remaining acceptable in others (4) Transformer-based models effectively learn from aggregated data that reduces false negative classifications by being more sensitive towards minority opinions for posts to be toxic. We publish the new annotations under the CC BY 4.0 license."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pachinger-etal-2025-disaggregated">
<titleInfo>
<title>A Disaggregated Dataset on English Offensiveness Containing Spans</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pia</namePart>
<namePart type="family">Pachinger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Janis</namePart>
<namePart type="family">Goldzycher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Planitzer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Neidhardt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Allan</namePart>
<namePart type="family">Hanbury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Perspectivist Approaches to NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gavin</namePart>
<namePart type="family">Abercrombie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valerio</namePart>
<namePart type="family">Basile</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simona</namePart>
<namePart type="family">Frenda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Tonelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shiran</namePart>
<namePart type="family">Dudy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-350-0</identifier>
</relatedItem>
<abstract>Toxicity labels at sub-document granularity and disaggregated labels lead to more nuanced and personalized toxicity classification and facilitate analysis. We re-annotate a subset of 1,983 posts from the Jigsaw Toxic Comment Classification Challenge and provide disaggregated toxicity labels and spans that identify inappropriate language and the targets of toxic statements. Manual analysis shows that five annotations per instance effectively capture meaningful disagreement patterns and allow for finer distinctions between genuine disagreement and that arising from annotation error or inconsistency. Our main findings are: (1) Disagreement often stems from divergent interpretations of edge-case toxicity. (2) Disagreement is especially high in cases of toxic statements involving non-human targets. (3) Disagreement on whether a passage consists of inappropriate language occurs not only on inherently questionable terms, but also on words that may be inappropriate in specific contexts while remaining acceptable in others. (4) Transformer-based models effectively learn from aggregated data that reduces false negative classifications by being more sensitive towards minority opinions that consider posts to be toxic. We publish the new annotations under the CC BY 4.0 license.</abstract>
<identifier type="citekey">pachinger-etal-2025-disaggregated</identifier>
<location>
<url>https://aclanthology.org/2025.nlperspectives-1.1/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>1</start>
<end>14</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Disaggregated Dataset on English Offensiveness Containing Spans
%A Pachinger, Pia
%A Goldzycher, Janis
%A Planitzer, Anna M.
%A Neidhardt, Julia
%A Hanbury, Allan
%Y Abercrombie, Gavin
%Y Basile, Valerio
%Y Frenda, Simona
%Y Tonelli, Sara
%Y Dudy, Shiran
%S Proceedings of the 4th Workshop on Perspectivist Approaches to NLP
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-350-0
%F pachinger-etal-2025-disaggregated
%X Toxicity labels at sub-document granularity and disaggregated labels lead to more nuanced and personalized toxicity classification and facilitate analysis. We re-annotate a subset of 1,983 posts from the Jigsaw Toxic Comment Classification Challenge and provide disaggregated toxicity labels and spans that identify inappropriate language and the targets of toxic statements. Manual analysis shows that five annotations per instance effectively capture meaningful disagreement patterns and allow for finer distinctions between genuine disagreement and that arising from annotation error or inconsistency. Our main findings are: (1) Disagreement often stems from divergent interpretations of edge-case toxicity. (2) Disagreement is especially high in cases of toxic statements involving non-human targets. (3) Disagreement on whether a passage consists of inappropriate language occurs not only on inherently questionable terms, but also on words that may be inappropriate in specific contexts while remaining acceptable in others. (4) Transformer-based models effectively learn from aggregated data that reduces false negative classifications by being more sensitive towards minority opinions that consider posts to be toxic. We publish the new annotations under the CC BY 4.0 license.
%U https://aclanthology.org/2025.nlperspectives-1.1/
%P 1-14
Markdown (Informal)
[A Disaggregated Dataset on English Offensiveness Containing Spans](https://aclanthology.org/2025.nlperspectives-1.1/) (Pachinger et al., NLPerspectives 2025)
ACL
Pia Pachinger, Janis Goldzycher, Anna M. Planitzer, Julia Neidhardt, and Allan Hanbury. 2025. A Disaggregated Dataset on English Offensiveness Containing Spans. In Proceedings of the 4th Workshop on Perspectivist Approaches to NLP, pages 1–14, Suzhou, China. Association for Computational Linguistics.