@inproceedings{herrera-berg-etal-2023-large,
title = "Large Language Models are biased to overestimate profoundness",
author = "Herrera-Berg, Eugenio and
Browne, Tom{\'a}s and
Le{\'o}n-Villagr{\'a}, Pablo and
Vives, Marc-Llu{\'\i}s and
Calderon, Cristian",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.599",
doi = "10.18653/v1/2023.emnlp-main.599",
pages = "9653--9661",
abstract = "Recent advancements in natural language processing by large language models (LLMs), such as GPT-4, have been suggested to approach Artificial General Intelligence. And yet, it is still under dispute whether LLMs possess similar reasoning abilities to humans. This study evaluates GPT-4 and various other LLMs in judging the profoundness of mundane, motivational, and pseudo-profound statements. We found a significant statement-to-statement correlation between the LLMs and humans, irrespective of the type of statements and the prompting technique used. However, LLMs systematically overestimate the profoundness of nonsensical statements, with the exception of Tk-instruct, which uniquely underestimates the profoundness of statements. Only few-shot learning prompts, as opposed to chain-of-thought prompting, draw LLMs ratings closer to humans. Furthermore, this work provides insights into the potential biases induced by Reinforcement Learning from Human Feedback (RLHF), inducing an increase in the bias to overestimate the profoundness of statements.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="herrera-berg-etal-2023-large">
<titleInfo>
<title>Large Language Models are biased to overestimate profoundness</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eugenio</namePart>
<namePart type="family">Herrera-Berg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tomás</namePart>
<namePart type="family">Browne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pablo</namePart>
<namePart type="family">León-Villagrá</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc-Lluís</namePart>
<namePart type="family">Vives</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cristian</namePart>
<namePart type="family">Calderon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent advancements in natural language processing by large language models (LLMs), such as GPT-4, have been suggested to approach Artificial General Intelligence. And yet, it is still under dispute whether LLMs possess similar reasoning abilities to humans. This study evaluates GPT-4 and various other LLMs in judging the profoundness of mundane, motivational, and pseudo-profound statements. We found a significant statement-to-statement correlation between the LLMs and humans, irrespective of the type of statements and the prompting technique used. However, LLMs systematically overestimate the profoundness of nonsensical statements, with the exception of Tk-instruct, which uniquely underestimates the profoundness of statements. Only few-shot learning prompts, as opposed to chain-of-thought prompting, draw LLMs ratings closer to humans. Furthermore, this work provides insights into the potential biases induced by Reinforcement Learning from Human Feedback (RLHF), inducing an increase in the bias to overestimate the profoundness of statements.</abstract>
<identifier type="citekey">herrera-berg-etal-2023-large</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.599</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.599</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>9653</start>
<end>9661</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Large Language Models are biased to overestimate profoundness
%A Herrera-Berg, Eugenio
%A Browne, Tomás
%A León-Villagrá, Pablo
%A Vives, Marc-Lluís
%A Calderon, Cristian
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F herrera-berg-etal-2023-large
%X Recent advancements in natural language processing by large language models (LLMs), such as GPT-4, have been suggested to approach Artificial General Intelligence. And yet, it is still under dispute whether LLMs possess similar reasoning abilities to humans. This study evaluates GPT-4 and various other LLMs in judging the profoundness of mundane, motivational, and pseudo-profound statements. We found a significant statement-to-statement correlation between the LLMs and humans, irrespective of the type of statements and the prompting technique used. However, LLMs systematically overestimate the profoundness of nonsensical statements, with the exception of Tk-instruct, which uniquely underestimates the profoundness of statements. Only few-shot learning prompts, as opposed to chain-of-thought prompting, draw LLMs ratings closer to humans. Furthermore, this work provides insights into the potential biases induced by Reinforcement Learning from Human Feedback (RLHF), inducing an increase in the bias to overestimate the profoundness of statements.
%R 10.18653/v1/2023.emnlp-main.599
%U https://aclanthology.org/2023.emnlp-main.599
%U https://doi.org/10.18653/v1/2023.emnlp-main.599
%P 9653-9661
Markdown (Informal)
[Large Language Models are biased to overestimate profoundness](https://aclanthology.org/2023.emnlp-main.599) (Herrera-Berg et al., EMNLP 2023)
ACL
- Eugenio Herrera-Berg, Tomás Browne, Pablo León-Villagrá, Marc-Lluís Vives, and Cristian Calderon. 2023. Large Language Models are biased to overestimate profoundness. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 9653–9661, Singapore. Association for Computational Linguistics.