@inproceedings{moore-etal-2024-large,
    title = "Are Large Language Models Consistent over Value-laden Questions?",
    author = "Moore, Jared and
      Deshpande, Tanvi and
      Yang, Diyi",
    editor = "Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.findings-emnlp.891",
    pages = "15185--15221",
    abstract = "Large language models (LLMs) appear to bias their survey answers toward certain values. Nonetheless, some argue that LLMs are too inconsistent to simulate particular values. Are they? To answer, we first define value consistency as the similarity of answers across 1) \textit{paraphrases} of one question, 2) related questions under one \textit{topic}, 3) multiple-choice and open-ended \textit{use-cases} of one question, and 4) \textit{multilingual} translations of a question to English, Chinese, German, and Japanese. We apply these measures to a few large, open LLMs including llama-3, as well as gpt-4o, using eight thousand questions spanning more than 300 topics. Unlike prior work, we find that \textit{models are relatively consistent} across paraphrases, use-cases, translations, and within a topic. Still, some inconsistencies remain. Models are more consistent on uncontroversial topics (e.g., in the U.S., {``}Thanksgiving{''}) than on controversial ones (e.g. {``}euthanasia{''}). Base models are both more consistent compared to fine-tuned models and are uniform in their consistency across topics, while fine-tuned models are more inconsistent about some topics (e.g. {``}euthanasia{''}) than others (e.g. {``}Women{'}s rights{''}) like our human participants.",
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="moore-etal-2024-large">
    <titleInfo>
      <title>Are Large Language Models Consistent over Value-laden Questions?</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jared</namePart>
      <namePart type="family">Moore</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tanvi</namePart>
      <namePart type="family">Deshpande</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Diyi</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Large language models (LLMs) appear to bias their survey answers toward certain values. Nonetheless, some argue that LLMs are too inconsistent to simulate particular values. Are they? To answer, we first define value consistency as the similarity of answers across 1) paraphrases of one question, 2) related questions under one topic, 3) multiple-choice and open-ended use-cases of one question, and 4) multilingual translations of a question to English, Chinese, German, and Japanese. We apply these measures to a few large, open LLMs including llama-3, as well as gpt-4o, using eight thousand questions spanning more than 300 topics. Unlike prior work, we find that models are relatively consistent across paraphrases, use-cases, translations, and within a topic. Still, some inconsistencies remain. Models are more consistent on uncontroversial topics (e.g., in the U.S., “Thanksgiving”) than on controversial ones (e.g. “euthanasia”). Base models are both more consistent compared to fine-tuned models and are uniform in their consistency across topics, while fine-tuned models are more inconsistent about some topics (e.g. “euthanasia”) than others (e.g. “Women’s rights”) like our human participants.</abstract>
    <identifier type="citekey">moore-etal-2024-large</identifier>
    <location>
      <url>https://aclanthology.org/2024.findings-emnlp.891</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>15185</start>
        <end>15221</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Are Large Language Models Consistent over Value-laden Questions?
%A Moore, Jared
%A Deshpande, Tanvi
%A Yang, Diyi
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F moore-etal-2024-large
%X Large language models (LLMs) appear to bias their survey answers toward certain values. Nonetheless, some argue that LLMs are too inconsistent to simulate particular values. Are they? To answer, we first define value consistency as the similarity of answers across 1) paraphrases of one question, 2) related questions under one topic, 3) multiple-choice and open-ended use-cases of one question, and 4) multilingual translations of a question to English, Chinese, German, and Japanese. We apply these measures to a few large, open LLMs including llama-3, as well as gpt-4o, using eight thousand questions spanning more than 300 topics. Unlike prior work, we find that models are relatively consistent across paraphrases, use-cases, translations, and within a topic. Still, some inconsistencies remain. Models are more consistent on uncontroversial topics (e.g., in the U.S., “Thanksgiving”) than on controversial ones (e.g. “euthanasia”). Base models are both more consistent compared to fine-tuned models and are uniform in their consistency across topics, while fine-tuned models are more inconsistent about some topics (e.g. “euthanasia”) than others (e.g. “Women’s rights”) like our human participants.
%U https://aclanthology.org/2024.findings-emnlp.891
%P 15185-15221

Markdown (Informal)
[Are Large Language Models Consistent over Value-laden Questions?](https://aclanthology.org/2024.findings-emnlp.891) (Moore et al., Findings 2024)
ACL
Jared Moore, Tanvi Deshpande, and Diyi Yang. 2024. [Are Large Language Models Consistent over Value-laden Questions?](https://aclanthology.org/2024.findings-emnlp.891). In *Findings of the Association for Computational Linguistics: EMNLP 2024*, pages 15185–15221, Miami, Florida, USA. Association for Computational Linguistics.