@inproceedings{madhavan-wadhawan-2024-causal,
title = "Causal {ATE} Mitigates Unintended Bias in Controlled Text Generation",
author = "Madhavan, Rahul and
Wadhawan, Kahini",
editor = "Barak, Libby and
Alikhani, Malihe",
booktitle = "Proceedings of the 28th Conference on Computational Natural Language Learning",
month = nov,
year = "2024",
address = "Miami, FL, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.conll-1.11",
pages = "130--142",
abstract = "We study attribute control in language models through the method of Causal Average Treatment Effect (Causal ATE). Existing methodsfor the attribute control task in Language Models(LMs) check for the co-occurrence of words in a sentence with the attribute of interest, and control for them. However, spurious correlation of the words with the attribute in the training dataset, can cause models to hallucinate the presence of the attribute when presented with the spurious correlate during inference. We show that the simple perturbation-based method of Causal ATE removes this unintended effect. Specifically, we ground it in the problem of toxicity mitigation, where a significant challenge lies in the inadvertent bias that often emerges towards protected groups post detoxification. We show that this unintended bias can be solved by the use of the Causal ATE metric. We provide experimental validations for our claims and release our code (anonymously) here: [github.com/causalate-mitigates-bias](https://github.com/causalate-mitigates-bias/causal-ate-mitigates-bias).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="madhavan-wadhawan-2024-causal">
<titleInfo>
<title>Causal ATE Mitigates Unintended Bias in Controlled Text Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Madhavan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kahini</namePart>
<namePart type="family">Wadhawan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th Conference on Computational Natural Language Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Libby</namePart>
<namePart type="family">Barak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Malihe</namePart>
<namePart type="family">Alikhani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, FL, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We study attribute control in language models through the method of Causal Average Treatment Effect (Causal ATE). Existing methodsfor the attribute control task in Language Models(LMs) check for the co-occurrence of words in a sentence with the attribute of interest, and control for them. However, spurious correlation of the words with the attribute in the training dataset, can cause models to hallucinate the presence of the attribute when presented with the spurious correlate during inference. We show that the simple perturbation-based method of Causal ATE removes this unintended effect. Specifically, we ground it in the problem of toxicity mitigation, where a significant challenge lies in the inadvertent bias that often emerges towards protected groups post detoxification. We show that this unintended bias can be solved by the use of the Causal ATE metric. We provide experimental validations for our claims and release our code (anonymously) here: [github.com/causalate-mitigates-bias](https://github.com/causalate-mitigates-bias/causal-ate-mitigates-bias).</abstract>
<identifier type="citekey">madhavan-wadhawan-2024-causal</identifier>
<location>
<url>https://aclanthology.org/2024.conll-1.11</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>130</start>
<end>142</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Causal ATE Mitigates Unintended Bias in Controlled Text Generation
%A Madhavan, Rahul
%A Wadhawan, Kahini
%Y Barak, Libby
%Y Alikhani, Malihe
%S Proceedings of the 28th Conference on Computational Natural Language Learning
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, FL, USA
%F madhavan-wadhawan-2024-causal
%X We study attribute control in language models through the method of Causal Average Treatment Effect (Causal ATE). Existing methodsfor the attribute control task in Language Models(LMs) check for the co-occurrence of words in a sentence with the attribute of interest, and control for them. However, spurious correlation of the words with the attribute in the training dataset, can cause models to hallucinate the presence of the attribute when presented with the spurious correlate during inference. We show that the simple perturbation-based method of Causal ATE removes this unintended effect. Specifically, we ground it in the problem of toxicity mitigation, where a significant challenge lies in the inadvertent bias that often emerges towards protected groups post detoxification. We show that this unintended bias can be solved by the use of the Causal ATE metric. We provide experimental validations for our claims and release our code (anonymously) here: [github.com/causalate-mitigates-bias](https://github.com/causalate-mitigates-bias/causal-ate-mitigates-bias).
%U https://aclanthology.org/2024.conll-1.11
%P 130-142
Markdown (Informal)
[Causal ATE Mitigates Unintended Bias in Controlled Text Generation](https://aclanthology.org/2024.conll-1.11) (Madhavan & Wadhawan, CoNLL 2024)
ACL