@inproceedings{gubelmann-karray-2025-assessing,
title = "Assessing Reliability and Political Bias In {LLM}s' Judgements of Formal and Material Inferences With Partisan Conclusions",
author = "Gubelmann, Reto and
Karray, Ghassen",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1450/",
doi = "10.18653/v1/2025.acl-long.1450",
pages = "30005--30031",
ISBN = "979-8-89176-251-0",
abstract = "This article examines LLMs' ability to correctly label simple inferences with partisan conclusions. For this, we develop a dataset with both formal and material inferences, containing logically equivalent pairs of inferences with conclusions that favor either the political left or the political right. This allows us to focus on political bias as a source of decrease in performance. Our samples are synthetically generated and thus highly controlled, covering both English and German. We assess the performance of 16 configurations of both open and proprietary state-of-the-art LLMs on that dataset, finding generally unreliable performance as well as widespread political bias which, in the case of the English samples, persists throughout our experimental settings."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gubelmann-karray-2025-assessing">
<titleInfo>
<title>Assessing Reliability and Political Bias In LLMs’ Judgements of Formal and Material Inferences With Partisan Conclusions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Reto</namePart>
<namePart type="family">Gubelmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ghassen</namePart>
<namePart type="family">Karray</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>This article examines LLMs’ ability to correctly label simple inferences with partisan conclusions. For this, we develop a dataset with both formal and material inferences, containing logically equivalent pairs of inferences with conclusions that favor either the political left or the political right. This allows us to focus on political bias as a source of decrease in performance. Our samples are synthetically generated and thus highly controlled, covering both English and German. We assess the performance of 16 configurations of both open and proprietary state-of-the-art LLMs on that dataset, finding generally unreliable performance as well as widespread political bias which, in the case of the English samples, persists throughout our experimental settings.</abstract>
<identifier type="citekey">gubelmann-karray-2025-assessing</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1450</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1450/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>30005</start>
<end>30031</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Assessing Reliability and Political Bias In LLMs’ Judgements of Formal and Material Inferences With Partisan Conclusions
%A Gubelmann, Reto
%A Karray, Ghassen
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F gubelmann-karray-2025-assessing
%X This article examines LLMs’ ability to correctly label simple inferences with partisan conclusions. For this, we develop a dataset with both formal and material inferences, containing logically equivalent pairs of inferences with conclusions that favor either the political left or the political right. This allows us to focus on political bias as a source of decrease in performance. Our samples are synthetically generated and thus highly controlled, covering both English and German. We assess the performance of 16 configurations of both open and proprietary state-of-the-art LLMs on that dataset, finding generally unreliable performance as well as widespread political bias which, in the case of the English samples, persists throughout our experimental settings.
%R 10.18653/v1/2025.acl-long.1450
%U https://aclanthology.org/2025.acl-long.1450/
%U https://doi.org/10.18653/v1/2025.acl-long.1450
%P 30005-30031
Markdown (Informal)
[Assessing Reliability and Political Bias In LLMs’ Judgements of Formal and Material Inferences With Partisan Conclusions](https://aclanthology.org/2025.acl-long.1450/) (Gubelmann & Karray, ACL 2025)
ACL