@inproceedings{fulay-etal-2024-relationship,
title = "On the Relationship between Truth and Political Bias in Language Models",
author = "Fulay, Suyash and
Brannon, William and
Mohanty, Shrestha and
Overney, Cassandra and
Poole-Dayan, Elinor and
Roy, Deb and
Kabbara, Jad",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.508",
pages = "9004--9018",
abstract = "Language model alignment research often attempts to ensure that models are not only helpful and harmless, but also truthful and unbiased. However, optimizing these objectives simultaneously can obscure how improving one aspect might impact the others. In this work, we focus on analyzing the relationship between two concepts essential in both language model alignment and political science: truthfulness and political bias. We train reward models on various popular truthfulness datasets and subsequently evaluate their political bias. Our findings reveal that optimizing reward models for truthfulness on these datasets tends to result in a left-leaning political bias. We also find that existing open-source reward models (i.e., those trained on standard human preference datasets) already show a similar bias and that the bias is larger for larger models. These results raise important questions about the datasets used to represent truthfulness, potential limitations of aligning models to be both truthful and politically unbiased, and what language models capture about the relationship between truth and politics.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fulay-etal-2024-relationship">
<titleInfo>
<title>On the Relationship between Truth and Political Bias in Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Suyash</namePart>
<namePart type="family">Fulay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">William</namePart>
<namePart type="family">Brannon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shrestha</namePart>
<namePart type="family">Mohanty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cassandra</namePart>
<namePart type="family">Overney</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elinor</namePart>
<namePart type="family">Poole-Dayan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Deb</namePart>
<namePart type="family">Roy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jad</namePart>
<namePart type="family">Kabbara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language model alignment research often attempts to ensure that models are not only helpful and harmless, but also truthful and unbiased. However, optimizing these objectives simultaneously can obscure how improving one aspect might impact the others. In this work, we focus on analyzing the relationship between two concepts essential in both language model alignment and political science: truthfulness and political bias. We train reward models on various popular truthfulness datasets and subsequently evaluate their political bias. Our findings reveal that optimizing reward models for truthfulness on these datasets tends to result in a left-leaning political bias. We also find that existing open-source reward models (i.e., those trained on standard human preference datasets) already show a similar bias and that the bias is larger for larger models. These results raise important questions about the datasets used to represent truthfulness, potential limitations of aligning models to be both truthful and politically unbiased, and what language models capture about the relationship between truth and politics.</abstract>
<identifier type="citekey">fulay-etal-2024-relationship</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.508</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>9004</start>
<end>9018</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Relationship between Truth and Political Bias in Language Models
%A Fulay, Suyash
%A Brannon, William
%A Mohanty, Shrestha
%A Overney, Cassandra
%A Poole-Dayan, Elinor
%A Roy, Deb
%A Kabbara, Jad
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F fulay-etal-2024-relationship
%X Language model alignment research often attempts to ensure that models are not only helpful and harmless, but also truthful and unbiased. However, optimizing these objectives simultaneously can obscure how improving one aspect might impact the others. In this work, we focus on analyzing the relationship between two concepts essential in both language model alignment and political science: truthfulness and political bias. We train reward models on various popular truthfulness datasets and subsequently evaluate their political bias. Our findings reveal that optimizing reward models for truthfulness on these datasets tends to result in a left-leaning political bias. We also find that existing open-source reward models (i.e., those trained on standard human preference datasets) already show a similar bias and that the bias is larger for larger models. These results raise important questions about the datasets used to represent truthfulness, potential limitations of aligning models to be both truthful and politically unbiased, and what language models capture about the relationship between truth and politics.
%U https://aclanthology.org/2024.emnlp-main.508
%P 9004-9018
Markdown (Informal)
[On the Relationship between Truth and Political Bias in Language Models](https://aclanthology.org/2024.emnlp-main.508) (Fulay et al., EMNLP 2024)
ACL
- Suyash Fulay, William Brannon, Shrestha Mohanty, Cassandra Overney, Elinor Poole-Dayan, Deb Roy, and Jad Kabbara. 2024. On the Relationship between Truth and Political Bias in Language Models. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 9004–9018, Miami, Florida, USA. Association for Computational Linguistics.