@inproceedings{tonolini-etal-2024-bayesian,
title = "{B}ayesian Prompt Ensembles: Model Uncertainty Estimation for Black-Box Large Language Models",
author = "Tonolini, Francesco and
Aletras, Nikolaos and
Massiah, Jordan and
Kazai, Gabriella",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.728/",
doi = "10.18653/v1/2024.findings-acl.728",
pages = "12229--12272",
abstract = "An important requirement for the reliable deployment of pre-trained large language models (LLMs) is the well-calibrated quantification of the uncertainty in their outputs. While the likelihood of predicting the next token is a practical surrogate of the data uncertainty learned during training, model uncertainty is challenging to estimate, i.e., due to lack of knowledge acquired during training. Prior efforts to quantify uncertainty of neural networks require specific architectures or (re-)training strategies, which are impractical to apply to LLMs with several billion parameters, or for black-box models where the architecture and parameters are not available. In this paper, we propose Bayesian Prompts Ensembles (BayesPE), a novel approach to effectively obtain well-calibrated uncertainty for the output of pre-trained LLMs. BayesPE computes output probabilities through a weighted ensemble of different, but semantically equivalent, task instruction prompts. The relative weights of the different prompts in the ensemble are estimated through approximate Bayesian variational inference over a small labeled validation set. We demonstrate that BayesPE approximates a Bayesian input layer for the LLM, providing a lower bound on the expected model error. In our extensive experiments, we show that BayesPE achieves significantly superior uncertainty calibration compared to several baselines over a range of natural language classification tasks, both in zero- and few-shot settings."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tonolini-etal-2024-bayesian">
<titleInfo>
<title>Bayesian Prompt Ensembles: Model Uncertainty Estimation for Black-Box Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Francesco</namePart>
<namePart type="family">Tonolini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolaos</namePart>
<namePart type="family">Aletras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Massiah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriella</namePart>
<namePart type="family">Kazai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>An important requirement for the reliable deployment of pre-trained large language models (LLMs) is the well-calibrated quantification of the uncertainty in their outputs. While the likelihood of predicting the next token is a practical surrogate of the data uncertainty learned during training, model uncertainty is challenging to estimate, i.e., due to lack of knowledge acquired during training. Prior efforts to quantify uncertainty of neural networks require specific architectures or (re-)training strategies, which are impractical to apply to LLMs with several billion parameters, or for black-box models where the architecture and parameters are not available. In this paper, we propose Bayesian Prompts Ensembles (BayesPE), a novel approach to effectively obtain well-calibrated uncertainty for the output of pre-trained LLMs. BayesPE computes output probabilities through a weighted ensemble of different, but semantically equivalent, task instruction prompts. The relative weights of the different prompts in the ensemble are estimated through approximate Bayesian variational inference over a small labeled validation set. We demonstrate that BayesPE approximates a Bayesian input layer for the LLM, providing a lower bound on the expected model error. In our extensive experiments, we show that BayesPE achieves significantly superior uncertainty calibration compared to several baselines over a range of natural language classification tasks, both in zero- and few-shot settings.</abstract>
<identifier type="citekey">tonolini-etal-2024-bayesian</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.728</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.728/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>12229</start>
<end>12272</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bayesian Prompt Ensembles: Model Uncertainty Estimation for Black-Box Large Language Models
%A Tonolini, Francesco
%A Aletras, Nikolaos
%A Massiah, Jordan
%A Kazai, Gabriella
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F tonolini-etal-2024-bayesian
%X An important requirement for the reliable deployment of pre-trained large language models (LLMs) is the well-calibrated quantification of the uncertainty in their outputs. While the likelihood of predicting the next token is a practical surrogate of the data uncertainty learned during training, model uncertainty is challenging to estimate, i.e., due to lack of knowledge acquired during training. Prior efforts to quantify uncertainty of neural networks require specific architectures or (re-)training strategies, which are impractical to apply to LLMs with several billion parameters, or for black-box models where the architecture and parameters are not available. In this paper, we propose Bayesian Prompts Ensembles (BayesPE), a novel approach to effectively obtain well-calibrated uncertainty for the output of pre-trained LLMs. BayesPE computes output probabilities through a weighted ensemble of different, but semantically equivalent, task instruction prompts. The relative weights of the different prompts in the ensemble are estimated through approximate Bayesian variational inference over a small labeled validation set. We demonstrate that BayesPE approximates a Bayesian input layer for the LLM, providing a lower bound on the expected model error. In our extensive experiments, we show that BayesPE achieves significantly superior uncertainty calibration compared to several baselines over a range of natural language classification tasks, both in zero- and few-shot settings.
%R 10.18653/v1/2024.findings-acl.728
%U https://aclanthology.org/2024.findings-acl.728/
%U https://doi.org/10.18653/v1/2024.findings-acl.728
%P 12229-12272
Markdown (Informal)
[Bayesian Prompt Ensembles: Model Uncertainty Estimation for Black-Box Large Language Models](https://aclanthology.org/2024.findings-acl.728/) (Tonolini et al., Findings 2024)
ACL