BibTeX

@inproceedings{jeyaraj-delany-2025-detecting,
title = "Detecting Gender Stereotypical Language Using Model-agnostic and Model-specific Explanations",
author = "Jeyaraj, Manuela Nayantara and
Delany, Sarah Jane",
editor = "Angelova, Galia and
Kunilovskaya, Maria and
Escribe, Marie and
Mitkov, Ruslan",
booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ranlp-1.57/",
pages = "481--490",
abstract = "AI models learn gender-stereotypical language from human data. So, understanding how well different explanation techniques capture diverse language features that suggest gender stereotypes in text can be useful in identifying stereotypes that could potentially lead to gender bias. The influential words identified by four explanation techniques (LIME, SHAP, Integrated Gradients (IG) and Attention) in a gender stereotype detection task were compared with words annotated by human evaluators. All techniques emphasized adjectives and verbs related to characteristic traits and gender roles as the most influential words. LIME was best at detecting explicitly gendered words, while SHAP, IG and Attention showed stronger overall alignment and considerable overlap. A combination of these techniques, combining the strengths of model-agnostic and model-specific explanations, performs better at capturing gender-stereotypical language. Extending to hate speech and sentiment prediction tasks, annotator agreement suggests these tasks to be more subjective while explanation techniques can better capture explicit markers in hate speech than the more nuanced gender stereotypes. This research highlights the strengths of different explanation techniques in capturing subjective gender stereotype language in text."
}

MODS XML

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="jeyaraj-delany-2025-detecting">
    <titleInfo>
      <title>Detecting Gender Stereotypical Language Using Model-agnostic and Model-specific Explanations</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Manuela</namePart>
      <namePart type="given">Nayantara</namePart>
      <namePart type="family">Jeyaraj</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sarah</namePart>
      <namePart type="given">Jane</namePart>
      <namePart type="family">Delany</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Galia</namePart>
        <namePart type="family">Angelova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Kunilovskaya</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marie</namePart>
        <namePart type="family">Escribe</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ruslan</namePart>
        <namePart type="family">Mitkov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
        <place>
          <placeTerm type="text">Varna, Bulgaria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>AI models learn gender-stereotypical language from human data. So, understanding how well different explanation techniques capture diverse language features that suggest gender stereotypes in text can be useful in identifying stereotypes that could potentially lead to gender bias. The influential words identified by four explanation techniques (LIME, SHAP, Integrated Gradients (IG) and Attention) in a gender stereotype detection task were compared with words annotated by human evaluators. All techniques emphasized adjectives and verbs related to characteristic traits and gender roles as the most influential words. LIME was best at detecting explicitly gendered words, while SHAP, IG and Attention showed stronger overall alignment and considerable overlap. A combination of these techniques, combining the strengths of model-agnostic and model-specific explanations, performs better at capturing gender-stereotypical language. Extending to hate speech and sentiment prediction tasks, annotator agreement suggests these tasks to be more subjective while explanation techniques can better capture explicit markers in hate speech than the more nuanced gender stereotypes. This research highlights the strengths of different explanation techniques in capturing subjective gender stereotype language in text.</abstract>
    <identifier type="citekey">jeyaraj-delany-2025-detecting</identifier>
    <location>
      <url>https://aclanthology.org/2025.ranlp-1.57/</url>
    </location>
    <part>
      <date>2025-09</date>
      <extent unit="page">
        <start>481</start>
        <end>490</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote

%0 Conference Proceedings
%T Detecting Gender Stereotypical Language Using Model-agnostic and Model-specific Explanations
%A Jeyaraj, Manuela Nayantara
%A Delany, Sarah Jane
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F jeyaraj-delany-2025-detecting
%X AI models learn gender-stereotypical language from human data. So, understanding how well different explanation techniques capture diverse language features that suggest gender stereotypes in text can be useful in identifying stereotypes that could potentially lead to gender bias. The influential words identified by four explanation techniques (LIME, SHAP, Integrated Gradients (IG) and Attention) in a gender stereotype detection task were compared with words annotated by human evaluators. All techniques emphasized adjectives and verbs related to characteristic traits and gender roles as the most influential words. LIME was best at detecting explicitly gendered words, while SHAP, IG and Attention showed stronger overall alignment and considerable overlap. A combination of these techniques, combining the strengths of model-agnostic and model-specific explanations, performs better at capturing gender-stereotypical language. Extending to hate speech and sentiment prediction tasks, annotator agreement suggests these tasks to be more subjective while explanation techniques can better capture explicit markers in hate speech than the more nuanced gender stereotypes. This research highlights the strengths of different explanation techniques in capturing subjective gender stereotype language in text.
%U https://aclanthology.org/2025.ranlp-1.57/
%P 481-490

Markdown (Informal)

[Detecting Gender Stereotypical Language Using Model-agnostic and Model-specific Explanations](https://aclanthology.org/2025.ranlp-1.57/) (Jeyaraj & Delany, RANLP 2025)
ACL

Manuela Nayantara Jeyaraj and Sarah Jane Delany. 2025. Detecting Gender Stereotypical Language Using Model-agnostic and Model-specific Explanations. In Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era, pages 481–490, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.