@inproceedings{koopman-zuccon-2023-dr,
title = "Dr {C}hat{GPT} tell me what {I} want to hear: How different prompts impact health answer correctness",
author = "Koopman, Bevan and
Zuccon, Guido",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.928",
doi = "10.18653/v1/2023.emnlp-main.928",
pages = "15012--15022",
abstract = "This paper investigates the significant impact different prompts have on the behaviour of ChatGPT when used for health information seeking. As people more and more depend on generative large language models (LLMs) like ChatGPT, it is critical to understand model behaviour under different conditions, especially for domains where incorrect answers can have serious consequences such as health. Using the TREC Misinformation dataset, we empirically evaluate ChatGPT to show not just its effectiveness but reveal that knowledge passed in the prompt can bias the model to the detriment of answer correctness. We show this occurs both for retrieve-then-generate pipelines and based on how a user phrases their question as well as the question type. This work has important implications for the development of more robust and transparent question-answering systems based on generative large language models. Prompts, raw result files and manual analysis are made publicly available at \url{https://github.com/ielab/drchatgpt-health_prompting}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="koopman-zuccon-2023-dr">
    <titleInfo>
      <title>Dr ChatGPT tell me what I want to hear: How different prompts impact health answer correctness</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Bevan</namePart>
      <namePart type="family">Koopman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Guido</namePart>
      <namePart type="family">Zuccon</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Houda</namePart>
        <namePart type="family">Bouamor</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juan</namePart>
        <namePart type="family">Pino</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kalika</namePart>
        <namePart type="family">Bali</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Singapore</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This paper investigates the significant impact different prompts have on the behaviour of ChatGPT when used for health information seeking. As people more and more depend on generative large language models (LLMs) like ChatGPT, it is critical to understand model behaviour under different conditions, especially for domains where incorrect answers can have serious consequences such as health. Using the TREC Misinformation dataset, we empirically evaluate ChatGPT to show not just its effectiveness but reveal that knowledge passed in the prompt can bias the model to the detriment of answer correctness. We show this occurs both for retrieve-then-generate pipelines and based on how a user phrases their question as well as the question type. This work has important implications for the development of more robust and transparent question-answering systems based on generative large language models. Prompts, raw result files and manual analysis are made publicly available at https://github.com/ielab/drchatgpt-health_prompting.</abstract>
    <identifier type="citekey">koopman-zuccon-2023-dr</identifier>
    <identifier type="doi">10.18653/v1/2023.emnlp-main.928</identifier>
    <location>
      <url>https://aclanthology.org/2023.emnlp-main.928</url>
    </location>
    <part>
      <date>2023-12</date>
      <extent unit="page">
        <start>15012</start>
        <end>15022</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Dr ChatGPT tell me what I want to hear: How different prompts impact health answer correctness
%A Koopman, Bevan
%A Zuccon, Guido
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F koopman-zuccon-2023-dr
%X This paper investigates the significant impact different prompts have on the behaviour of ChatGPT when used for health information seeking. As people more and more depend on generative large language models (LLMs) like ChatGPT, it is critical to understand model behaviour under different conditions, especially for domains where incorrect answers can have serious consequences such as health. Using the TREC Misinformation dataset, we empirically evaluate ChatGPT to show not just its effectiveness but reveal that knowledge passed in the prompt can bias the model to the detriment of answer correctness. We show this occurs both for retrieve-then-generate pipelines and based on how a user phrases their question as well as the question type. This work has important implications for the development of more robust and transparent question-answering systems based on generative large language models. Prompts, raw result files and manual analysis are made publicly available at https://github.com/ielab/drchatgpt-health_prompting.
%R 10.18653/v1/2023.emnlp-main.928
%U https://aclanthology.org/2023.emnlp-main.928
%U https://doi.org/10.18653/v1/2023.emnlp-main.928
%P 15012-15022
Markdown (Informal)
[Dr ChatGPT tell me what I want to hear: How different prompts impact health answer correctness](https://aclanthology.org/2023.emnlp-main.928) (Koopman & Zuccon, EMNLP 2023)
ACL
Bevan Koopman and Guido Zuccon. 2023. Dr ChatGPT tell me what I want to hear: How different prompts impact health answer correctness. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 15012–15022, Singapore. Association for Computational Linguistics.