% BibTeX record for the PiCSAR paper (Findings of ACL 2026).
% Single-letter name initials carry a trailing period, matching the editor list.
% Case-sensitive words are protected with whole-word braces.
@inproceedings{leang-etal-2026-picsar,
  title     = {{PiCSAR}: Probabilistic Confidence Selection and Ranking for Reasoning Chains},
  author    = {Leang, Joshua Ong Jun and
               Zhao, Zheng and
               Gema, Aryo Pradipta and
               Yang, Sohee and
               Kwan, Wai-Chung and
               He, Xuanli and
               Li, Wenda and
               Minervini, Pasquale and
               Giunchiglia, Eleonora and
               Cohen, Shay B.},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1577/},
  pages     = {31511--31544},
  isbn      = {979-8-89176-395-1},
  abstract  = {Best-of-$n$ sampling improves the accuracy of large language models (LLMs) and large reasoning models (LRMs) by generating multiple candidate solutions and selecting the one with the highest reward. The key challenge for reasoning tasks is designing a scoring function that can identify correct reasoning chains without access to ground-truth answers. We propose Probabilistic Confidence Selection and Ranking for Reasoning Chains (PiCSAR): a simple, training-free method that scores each candidate generation using the joint log-likelihood of the reasoning and final answer. This method utilises both the scores of the reasoning path (*reasoning confidence*) and the final answer (*answer confidence*). PiCSAR achieves substantial gains across several benchmarks ($+11.7$ on AIME2024, $+9.81$ on AIME2025), outperforming baselines with at least 2x fewer samples in 20 out of 25 comparisons. Our analysis reveals that correct reasoning chains exhibit higher reasoning and answer confidence, justifying the effectiveness of PiCSAR.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for a single paper; the mods ID is the same string as the citekey identifier below. -->
  <mods ID="leang-etal-2026-picsar">
    <titleInfo>
      <title>PiCSAR: Probabilistic Confidence Selection and Ranking for Reasoning Chains</title>
    </titleInfo>

    <!-- Authors, in listed order. Multi-part given names use one namePart per token. -->
    <name type="personal">
      <namePart type="given">Joshua</namePart>
      <namePart type="given">Ong</namePart>
      <namePart type="given">Jun</namePart>
      <namePart type="family">Leang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zheng</namePart>
      <namePart type="family">Zhao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aryo</namePart>
      <namePart type="given">Pradipta</namePart>
      <namePart type="family">Gema</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sohee</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wai-Chung</namePart>
      <namePart type="family">Kwan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xuanli</namePart>
      <namePart type="family">He</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wenda</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pasquale</namePart>
      <namePart type="family">Minervini</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Eleonora</namePart>
      <namePart type="family">Giunchiglia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shay</namePart>
      <namePart type="given">B</namePart>
      <namePart type="family">Cohen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date of the paper (year-month). -->
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the containing volume, with its editors, publisher, place, genre and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>

    <!-- Plain-text abstract (no TeX math delimiters in this copy). -->
    <abstract>Best-of-n sampling improves the accuracy of large language models (LLMs) and large reasoning models (LRMs) by generating multiple candidate solutions and selecting the one with the highest reward. The key challenge for reasoning tasks is designing a scoring function that can identify correct reasoning chains without access to ground-truth answers. We propose Probabilistic Confidence Selection and Ranking for Reasoning Chains (PiCSAR): a simple, training-free method that scores each candidate generation using the joint log-likelihood of the reasoning and final answer. This method utilises both the scores of the reasoning path (*reasoning confidence*) and the final answer (*answer confidence*). PiCSAR achieves substantial gains across several benchmarks (+11.7 on AIME2024, +9.81 on AIME2025), outperforming baselines with at least 2x fewer samples in 20 out of 25 comparisons. Our analysis reveals that correct reasoning chains exhibit higher reasoning and answer confidence, justifying the effectiveness of PiCSAR.</abstract>

    <!-- Identifiers and location of the record. -->
    <identifier type="citekey">leang-etal-2026-picsar</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1577/</url>
    </location>

    <!-- Position within the host volume: date and page extent. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>31511</start>
        <end>31544</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T PiCSAR: Probabilistic Confidence Selection and Ranking for Reasoning Chains
%A Leang, Joshua Ong Jun
%A Zhao, Zheng
%A Gema, Aryo Pradipta
%A Yang, Sohee
%A Kwan, Wai-Chung
%A He, Xuanli
%A Li, Wenda
%A Minervini, Pasquale
%A Giunchiglia, Eleonora
%A Cohen, Shay B.
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F leang-etal-2026-picsar
%X Best-of-n sampling improves the accuracy of large language models (LLMs) and large reasoning models (LRMs) by generating multiple candidate solutions and selecting the one with the highest reward. The key challenge for reasoning tasks is designing a scoring function that can identify correct reasoning chains without access to ground-truth answers. We propose Probabilistic Confidence Selection and Ranking for Reasoning Chains (PiCSAR): a simple, training-free method that scores each candidate generation using the joint log-likelihood of the reasoning and final answer. This method utilises both the scores of the reasoning path (*reasoning confidence*) and the final answer (*answer confidence*). PiCSAR achieves substantial gains across several benchmarks (+11.7 on AIME2024, +9.81 on AIME2025), outperforming baselines with at least 2x fewer samples in 20 out of 25 comparisons. Our analysis reveals that correct reasoning chains exhibit higher reasoning and answer confidence, justifying the effectiveness of PiCSAR.
%U https://aclanthology.org/2026.findings-acl.1577/
%P 31511-31544
Markdown (Informal)
[PiCSAR: Probabilistic Confidence Selection and Ranking for Reasoning Chains](https://aclanthology.org/2026.findings-acl.1577/) (Leang et al., Findings 2026)
ACL
- Joshua Ong Jun Leang, Zheng Zhao, Aryo Pradipta Gema, Sohee Yang, Wai-Chung Kwan, Xuanli He, Wenda Li, Pasquale Minervini, Eleonora Giunchiglia, and Shay B. Cohen. 2026. PiCSAR: Probabilistic Confidence Selection and Ranking for Reasoning Chains. In Findings of the Association for Computational Linguistics: ACL 2026, pages 31511–31544, San Diego, California, United States. Association for Computational Linguistics.