@inproceedings{pena-gnecco-etal-2026-verbanexai,
title = "{V}erba{N}ex{AI} at {S}em{E}val-2026 Task 5: Few-Shot Chain-of-Thought with Selective Self-Consistency and Isotonic Calibration for Word Sense Plausibility Rating",
author = "Pe{\~n}a Gnecco, Daniel and
Puertas, Edwin and
Martinez Santos, Juan Carlos and
Serrano, Jairo",
editor = "Kochmar, Ekaterina and
Ghosh, Debanjan and
North, Kai and
Komachi, Mamoru",
booktitle = "Proceedings of the 20th {I}nternational {W}orkshop on {S}emantic {E}valuation (2026)",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.semeval-1.190/",
pages = "1469--1476",
ISBN = "979-8-89176-414-9",
abstract = "We present a system for rating word sense plausibility in ambiguous narrative contexts for SemEval-2026 Task 5. Our approach ensembles three large language models (Llama-3.1 70B, Qwen-2.5 32B, and Gemma-2 27B) using a computationally efficient, uncertainty-aware pipeline. We combine few-shot chain-of-thought prompting with selective self-consistency, which applies stochastic multiple sampling exclusively to items identified as inherently ambiguous. This targeted strategy reduces inference costs by approximately 45{\%} while maintaining robustness in predictions. To correct the systematic bias of LLMs toward extreme ratings, we apply isotonic regression to shift the output distribution toward patterns of human judgment. Our system achieves a Spearman correlation of 0.67 and an accuracy within 0.76 standard deviations, ranking 34th out of 79 participating teams (top 43{\%} without task-specific fine-tuning). Detailed error analysis reveals that while our system performs strongly on clear contexts ({\ensuremath{\rho}} = 0.78), current prompting paradigms struggle significantly to model multimodal human disagreement in genuinely ambiguous cases ({\ensuremath{\rho}} = 0.58), highlighting an important challenge for future work on subjective semantic tasks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pena-gnecco-etal-2026-verbanexai">
<titleInfo>
<title>VerbaNexAI at SemEval-2026 Task 5: Few-Shot Chain-of-Thought with Selective Self-Consistency and Isotonic Calibration for Word Sense Plausibility Rating</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Peña Gnecco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edwin</namePart>
<namePart type="family">Puertas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">Carlos</namePart>
<namePart type="family">Martinez Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jairo</namePart>
<namePart type="family">Serrano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Workshop on Semantic Evaluation (2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Kochmar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debanjan</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">North</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mamoru</namePart>
<namePart type="family">Komachi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-414-9</identifier>
</relatedItem>
<abstract>We present a system for rating word sense plausibility in ambiguous narrative contexts for SemEval-2026 Task 5. Our approach ensembles three large language models (Llama-3.1 70B, Qwen-2.5 32B, and Gemma-2 27B) using a computationally efficient, uncertainty-aware pipeline. We combine few-shot chain-of-thought prompting with selective self-consistency, which applies stochastic multiple sampling exclusively to items identified as inherently ambiguous. This targeted strategy reduces inference costs by approximately 45% while maintaining robustness in predictions. To correct the systematic bias of LLMs toward extreme ratings, we apply isotonic regression to shift the output distribution toward patterns of human judgment. Our system achieves a Spearman correlation of 0.67 and an accuracy within 0.76 standard deviations, ranking 34th out of 79 participating teams (top 43% without task-specific fine-tuning). Detailed error analysis reveals that while our system performs strongly on clear contexts (ρ = 0.78), current prompting paradigms struggle significantly to model multimodal human disagreement in genuinely ambiguous cases (ρ = 0.58), highlighting an important challenge for future work on subjective semantic tasks.</abstract>
<identifier type="citekey">pena-gnecco-etal-2026-verbanexai</identifier>
<location>
<url>https://aclanthology.org/2026.semeval-1.190/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1469</start>
<end>1476</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T VerbaNexAI at SemEval-2026 Task 5: Few-Shot Chain-of-Thought with Selective Self-Consistency and Isotonic Calibration for Word Sense Plausibility Rating
%A Peña Gnecco, Daniel
%A Puertas, Edwin
%A Martinez Santos, Juan Carlos
%A Serrano, Jairo
%Y Kochmar, Ekaterina
%Y Ghosh, Debanjan
%Y North, Kai
%Y Komachi, Mamoru
%S Proceedings of the 20th International Workshop on Semantic Evaluation (2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-414-9
%F pena-gnecco-etal-2026-verbanexai
%X We present a system for rating word sense plausibility in ambiguous narrative contexts for SemEval-2026 Task 5. Our approach ensembles three large language models (Llama-3.1 70B, Qwen-2.5 32B, and Gemma-2 27B) using a computationally efficient, uncertainty-aware pipeline. We combine few-shot chain-of-thought prompting with selective self-consistency, which applies stochastic multiple sampling exclusively to items identified as inherently ambiguous. This targeted strategy reduces inference costs by approximately 45% while maintaining robustness in predictions. To correct the systematic bias of LLMs toward extreme ratings, we apply isotonic regression to shift the output distribution toward patterns of human judgment. Our system achieves a Spearman correlation of 0.67 and an accuracy within 0.76 standard deviations, ranking 34th out of 79 participating teams (top 43% without task-specific fine-tuning). Detailed error analysis reveals that while our system performs strongly on clear contexts (ρ = 0.78), current prompting paradigms struggle significantly to model multimodal human disagreement in genuinely ambiguous cases (ρ = 0.58), highlighting an important challenge for future work on subjective semantic tasks.
%U https://aclanthology.org/2026.semeval-1.190/
%P 1469-1476
Markdown (Informal)
[VerbaNexAI at SemEval-2026 Task 5: Few-Shot Chain-of-Thought with Selective Self-Consistency and Isotonic Calibration for Word Sense Plausibility Rating](https://aclanthology.org/2026.semeval-1.190/) (Peña Gnecco et al., SemEval 2026)
ACL