@inproceedings{ortega-riba-etal-2026-cuclasic,
title = "{CUCLASIC} at {S}em{E}val-2026 Task 5: {LLM} Prompting Strategies for Rating Ambiguous Word Senses",
author = "Ortega Riba, Federico and
Wilkerson, Jasper and
Lafreniere Adams, Kelsey",
editor = "Kochmar, Ekaterina and
Ghosh, Debanjan and
North, Kai and
Komachi, Mamoru",
booktitle = "Proceedings of the 20th {I}nternational {W}orkshop on {S}emantic {E}valuation (2026)",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.semeval-1.121/",
pages = "886--893",
ISBN = "979-8-89176-414-9",
abstract = "Word sense disambiguation has been a foundational task in computational semantics since the 1990s, but remains an unsolved problem when it comes to bridging human and computational evaluation of ambiguity. The SemEval-2026 Task 5 attempts to address this gap. We test six Large Language Models (LLMs) from the Llama and Gemini families in order to evaluate LLMs' ratings of ambiguous textual excerpts, experimenting with zero- and few-shot variants of prompts and analyzing how simple linguistic cues improve performance. We propose a methodology of eliciting human-like ratings from language models by using examples with low and high standard deviations between human ratings. We further evaluate and compare the prediction patterns of different models and how they align with the human generated ratings. Our best model (Gemini 3-Flash) achieves a 75{\%} score combining Spearman correlation and accuracy within one standard deviation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ortega-riba-etal-2026-cuclasic">
<titleInfo>
<title>CUCLASIC at SemEval-2026 Task 5: LLM Prompting Strategies for Rating Ambiguous Word Senses</title>
</titleInfo>
<name type="personal">
<namePart type="given">Federico</namePart>
<namePart type="family">Ortega Riba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jasper</namePart>
<namePart type="family">Wilkerson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kelsey</namePart>
<namePart type="family">Lafreniere Adams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Workshop on Semantic Evaluation (2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Kochmar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debanjan</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">North</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mamoru</namePart>
<namePart type="family">Komachi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-414-9</identifier>
</relatedItem>
<abstract>Word sense disambiguation has been a foundational task in computational semantics since the 1990s, but remains an unsolved problem when it comes to bridging human and computational evaluation of ambiguity. The SemEval-2026 Task 5 attempts to address this gap. We test six Large Language Models (LLMs) from the Llama and Gemini families in order to evaluate LLMs’ ratings of ambiguous textual excerpts, experimenting with zero- and few-shot variants of prompts and analyzing how simple linguistic cues improve performance. We propose a methodology of eliciting human-like ratings from language models by using examples with low and high standard deviations between human ratings. We further evaluate and compare the prediction patterns of different models and how they align with the human generated ratings. Our best model (Gemini 3-Flash) achieves a 75% score combining Spearman correlation and accuracy within one standard deviation.</abstract>
<identifier type="citekey">ortega-riba-etal-2026-cuclasic</identifier>
<location>
<url>https://aclanthology.org/2026.semeval-1.121/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>886</start>
<end>893</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CUCLASIC at SemEval-2026 Task 5: LLM Prompting Strategies for Rating Ambiguous Word Senses
%A Ortega Riba, Federico
%A Wilkerson, Jasper
%A Lafreniere Adams, Kelsey
%Y Kochmar, Ekaterina
%Y Ghosh, Debanjan
%Y North, Kai
%Y Komachi, Mamoru
%S Proceedings of the 20th International Workshop on Semantic Evaluation (2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-414-9
%F ortega-riba-etal-2026-cuclasic
%X Word sense disambiguation has been a foundational task in computational semantics since the 1990s, but remains an unsolved problem when it comes to bridging human and computational evaluation of ambiguity. The SemEval-2026 Task 5 attempts to address this gap. We test six Large Language Models (LLMs) from the Llama and Gemini families in order to evaluate LLMs’ ratings of ambiguous textual excerpts, experimenting with zero- and few-shot variants of prompts and analyzing how simple linguistic cues improve performance. We propose a methodology of eliciting human-like ratings from language models by using examples with low and high standard deviations between human ratings. We further evaluate and compare the prediction patterns of different models and how they align with the human generated ratings. Our best model (Gemini 3-Flash) achieves a 75% score combining Spearman correlation and accuracy within one standard deviation.
%U https://aclanthology.org/2026.semeval-1.121/
%P 886-893
Markdown (Informal)
[CUCLASIC at SemEval-2026 Task 5: LLM Prompting Strategies for Rating Ambiguous Word Senses](https://aclanthology.org/2026.semeval-1.121/) (Ortega Riba et al., SemEval 2026)
ACL