BibTeX
@inproceedings{xie-etal-2024-ask,
    title = "Ask Again, Then Fail: Large Language Models' Vacillations in Judgment",
    author = "Xie, Qiming and
      Wang, Zengzhi and
      Feng, Yi and
      Xia, Rui",
    editor = "Ku, Lun-Wei and
      Martins, Andre and
      Srikumar, Vivek",
    booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.luhme-long.577/",
    doi = "10.18653/v1/2024.acl-long.577",
    pages = "10709--10745",
    abstract = "We observe that current large language models often waver in their judgments when faced with follow-up questions, even if the original judgment was correct. This wavering presents a significant challenge for generating reliable responses and building user trust. To comprehensively assess this issue, we introduce a Follow-up Questioning Mechanism along with two metrics to quantify this inconsistency, confirming its widespread presence in current large language models. Furthermore, to mitigate this issue, we explore various prompting strategies for closed-source models, and develop a training-based framework Unwavering-FQ that teaches large language models to maintain their originally correct judgments through synthesized high-quality preference data. Our experimental results confirm the effectiveness of our framework and its ability to enhance the general capabilities of large language models."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="xie-etal-2024-ask">
    <titleInfo>
      <title>Ask Again, Then Fail: Large Language Models’ Vacillations in Judgment</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Qiming</namePart>
      <namePart type="family">Xie</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zengzhi</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yi</namePart>
      <namePart type="family">Feng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rui</namePart>
      <namePart type="family">Xia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Lun-Wei</namePart>
        <namePart type="family">Ku</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andre</namePart>
        <namePart type="family">Martins</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vivek</namePart>
        <namePart type="family">Srikumar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bangkok, Thailand</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We observe that current large language models often waver in their judgments when faced with follow-up questions, even if the original judgment was correct. This wavering presents a significant challenge for generating reliable responses and building user trust. To comprehensively assess this issue, we introduce a Follow-up Questioning Mechanism along with two metrics to quantify this inconsistency, confirming its widespread presence in current large language models. Furthermore, to mitigate this issue, we explore various prompting strategies for closed-source models, and develop a training-based framework Unwavering-FQ that teaches large language models to maintain their originally correct judgments through synthesized high-quality preference data. Our experimental results confirm the effectiveness of our framework and its ability to enhance the general capabilities of large language models.</abstract>
    <identifier type="citekey">xie-etal-2024-ask</identifier>
    <identifier type="doi">10.18653/v1/2024.acl-long.577</identifier>
    <location>
      <url>https://aclanthology.org/2024.luhme-long.577/</url>
    </location>
    <part>
      <date>2024-08</date>
      <extent unit="page">
        <start>10709</start>
        <end>10745</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Ask Again, Then Fail: Large Language Models’ Vacillations in Judgment
%A Xie, Qiming
%A Wang, Zengzhi
%A Feng, Yi
%A Xia, Rui
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F xie-etal-2024-ask
%X We observe that current large language models often waver in their judgments when faced with follow-up questions, even if the original judgment was correct. This wavering presents a significant challenge for generating reliable responses and building user trust. To comprehensively assess this issue, we introduce a Follow-up Questioning Mechanism along with two metrics to quantify this inconsistency, confirming its widespread presence in current large language models. Furthermore, to mitigate this issue, we explore various prompting strategies for closed-source models, and develop a training-based framework Unwavering-FQ that teaches large language models to maintain their originally correct judgments through synthesized high-quality preference data. Our experimental results confirm the effectiveness of our framework and its ability to enhance the general capabilities of large language models.
%R 10.18653/v1/2024.acl-long.577
%U https://aclanthology.org/2024.luhme-long.577/
%U https://doi.org/10.18653/v1/2024.acl-long.577
%P 10709-10745
Markdown (Informal)
[Ask Again, Then Fail: Large Language Models’ Vacillations in Judgment](https://aclanthology.org/2024.luhme-long.577/) (Xie et al., ACL 2024)
ACL
Qiming Xie, Zengzhi Wang, Yi Feng, and Rui Xia. 2024. [Ask Again, Then Fail: Large Language Models’ Vacillations in Judgment](https://aclanthology.org/2024.luhme-long.577/). In *Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)*, pages 10709–10745, Bangkok, Thailand. Association for Computational Linguistics.