@inproceedings{lutz-etal-2024-local,
title = "Local Contrastive Editing of Gender Stereotypes",
author = "Lutz, Marlene and
Choenni, Rochelle and
Strohmaier, Markus and
Lauscher, Anne",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1197/",
doi = "10.18653/v1/2024.emnlp-main.1197",
pages = "21474--21493",
abstract = "Stereotypical bias encoded in language models (LMs) poses a threat to safe language technology, yet our understanding of how bias manifests in the parameters of LMs remains incomplete. We introduce local contrastive editing that enables the localization and editing of a subset of weights in a target model in relation to a reference model. We deploy this approach to identify and modify subsets of weights that are associated with gender stereotypes in LMs. Through a series of experiments we demonstrate that local contrastive editing can precisely localize and control a small subset ($< 0.5\%$) of weights that encode gender bias. Our work (i) advances our understanding of how stereotypical biases can manifest in the parameter space of LMs and (ii) opens up new avenues for developing parameter-efficient strategies for controlling model properties in a contrastive manner."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lutz-etal-2024-local">
<titleInfo>
<title>Local Contrastive Editing of Gender Stereotypes</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marlene</namePart>
<namePart type="family">Lutz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rochelle</namePart>
<namePart type="family">Choenni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Markus</namePart>
<namePart type="family">Strohmaier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anne</namePart>
<namePart type="family">Lauscher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Stereotypical bias encoded in language models (LMs) poses a threat to safe language technology, yet our understanding of how bias manifests in the parameters of LMs remains incomplete. We introduce local contrastive editing that enables the localization and editing of a subset of weights in a target model in relation to a reference model. We deploy this approach to identify and modify subsets of weights that are associated with gender stereotypes in LMs. Through a series of experiments we demonstrate that local contrastive editing can precisely localize and control a small subset (< 0.5%) of weights that encode gender bias. Our work (i) advances our understanding of how stereotypical biases can manifest in the parameter space of LMs and (ii) opens up new avenues for developing parameter-efficient strategies for controlling model properties in a contrastive manner.</abstract>
<identifier type="citekey">lutz-etal-2024-local</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.1197</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1197/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>21474</start>
<end>21493</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Local Contrastive Editing of Gender Stereotypes
%A Lutz, Marlene
%A Choenni, Rochelle
%A Strohmaier, Markus
%A Lauscher, Anne
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F lutz-etal-2024-local
%X Stereotypical bias encoded in language models (LMs) poses a threat to safe language technology, yet our understanding of how bias manifests in the parameters of LMs remains incomplete. We introduce local contrastive editing that enables the localization and editing of a subset of weights in a target model in relation to a reference model. We deploy this approach to identify and modify subsets of weights that are associated with gender stereotypes in LMs. Through a series of experiments we demonstrate that local contrastive editing can precisely localize and control a small subset (< 0.5%) of weights that encode gender bias. Our work (i) advances our understanding of how stereotypical biases can manifest in the parameter space of LMs and (ii) opens up new avenues for developing parameter-efficient strategies for controlling model properties in a contrastive manner.
%R 10.18653/v1/2024.emnlp-main.1197
%U https://aclanthology.org/2024.emnlp-main.1197/
%U https://doi.org/10.18653/v1/2024.emnlp-main.1197
%P 21474-21493
Markdown (Informal)
[Local Contrastive Editing of Gender Stereotypes](https://aclanthology.org/2024.emnlp-main.1197/) (Lutz et al., EMNLP 2024)
ACL
- Marlene Lutz, Rochelle Choenni, Markus Strohmaier, and Anne Lauscher. 2024. Local Contrastive Editing of Gender Stereotypes. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 21474–21493, Miami, Florida, USA. Association for Computational Linguistics.