@inproceedings{kiehne-etal-2024-analyzing,
    title = "Analyzing Effects of Learning Downstream Tasks on Moral Bias in Large Language Models",
    author = {Kiehne, Niklas and
      Ljapunov, Alexander and
      B{\"a}tje, Marc and
      Balke, Wolf-Tilo},
    editor = "Calzolari, Nicoletta and
      Kan, Min-Yen and
      Hoste, Veronique and
      Lenci, Alessandro and
      Sakti, Sakriani and
      Xue, Nianwen",
    booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
    month = may,
    year = "2024",
    address = "Torino, Italia",
    publisher = "ELRA and ICCL",
    url = "https://aclanthology.org/2024.lrec-main.82",
    pages = "904--923",
    abstract = "Pre-training and fine-tuning large language models (LMs) is currently the state-of-the-art methodology for enabling data-scarce downstream tasks. However, the derived models still tend to replicate and perpetuate social biases. To understand this process in more detail, this paper investigates the actual effects of learning downstream tasks on moral bias in LMs. We develop methods to assess the agreement of LMs to explicitly codified norms in both pre-training and fine-tuning stages. Even if a pre-trained foundation model exhibits consistent norms, we find that introducing downstream tasks may indeed lead to unexpected inconsistencies in norm representation. Specifically, we observe two phenomena during fine-tuning across both masked and causal LMs: (1) pre-existing moral bias may be mitigated or amplified even when presented with opposing views and (2) prompt sensitivity may be negatively impacted. We provide empirical evidence of models deteriorating into conflicting states, where contradictory answers can easily be triggered by slight modifications in the input sequence. Our findings thus raise concerns about the general ability of LMs to mitigate moral biases effectively.",
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kiehne-etal-2024-analyzing">
<titleInfo>
<title>Analyzing Effects of Learning Downstream Tasks on Moral Bias in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Niklas</namePart>
<namePart type="family">Kiehne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Ljapunov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Bätje</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wolf-Tilo</namePart>
<namePart type="family">Balke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Pre-training and fine-tuning large language models (LMs) is currently the state-of-the-art methodology for enabling data-scarce downstream tasks. However, the derived models still tend to replicate and perpetuate social biases. To understand this process in more detail, this paper investigates the actual effects of learning downstream tasks on moral bias in LMs. We develop methods to assess the agreement of LMs to explicitly codified norms in both pre-training and fine-tuning stages. Even if a pre-trained foundation model exhibits consistent norms, we find that introducing downstream tasks may indeed lead to unexpected inconsistencies in norm representation. Specifically, we observe two phenomena during fine-tuning across both masked and causal LMs: (1) pre-existing moral bias may be mitigated or amplified even when presented with opposing views and (2) prompt sensitivity may be negatively impacted. We provide empirical evidence of models deteriorating into conflicting states, where contradictory answers can easily be triggered by slight modifications in the input sequence. Our findings thus raise concerns about the general ability of LMs to mitigate moral biases effectively.</abstract>
<identifier type="citekey">kiehne-etal-2024-analyzing</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.82</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>904</start>
<end>923</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Analyzing Effects of Learning Downstream Tasks on Moral Bias in Large Language Models
%A Kiehne, Niklas
%A Ljapunov, Alexander
%A Bätje, Marc
%A Balke, Wolf-Tilo
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F kiehne-etal-2024-analyzing
%X Pre-training and fine-tuning large language models (LMs) is currently the state-of-the-art methodology for enabling data-scarce downstream tasks. However, the derived models still tend to replicate and perpetuate social biases. To understand this process in more detail, this paper investigates the actual effects of learning downstream tasks on moral bias in LMs. We develop methods to assess the agreement of LMs to explicitly codified norms in both pre-training and fine-tuning stages. Even if a pre-trained foundation model exhibits consistent norms, we find that introducing downstream tasks may indeed lead to unexpected inconsistencies in norm representation. Specifically, we observe two phenomena during fine-tuning across both masked and causal LMs: (1) pre-existing moral bias may be mitigated or amplified even when presented with opposing views and (2) prompt sensitivity may be negatively impacted. We provide empirical evidence of models deteriorating into conflicting states, where contradictory answers can easily be triggered by slight modifications in the input sequence. Our findings thus raise concerns about the general ability of LMs to mitigate moral biases effectively.
%U https://aclanthology.org/2024.lrec-main.82
%P 904-923

Markdown (Informal)

[Analyzing Effects of Learning Downstream Tasks on Moral Bias in Large Language Models](https://aclanthology.org/2024.lrec-main.82) (Kiehne et al., LREC-COLING 2024)

ACL

Niklas Kiehne, Alexander Ljapunov, Marc Bätje, and Wolf-Tilo Balke. 2024. Analyzing Effects of Learning Downstream Tasks on Moral Bias in Large Language Models. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 904–923, Torino, Italia. ELRA and ICCL.