@inproceedings{bago-bakaric-2025-shot,
title = "Few-Shot Prompting, Full-Scale Confusion: Evaluating Large Language Models for Humor Detection in {C}roatian Tweets",
author = "Bago, Petra and
Bakari{\'c}, Nikola",
editor = "Piskorski, Jakub and
P{\v{r}}ib{\'a}{\v{n}}, Pavel and
Nakov, Preslav and
Yangarber, Roman and
Marcinczuk, Michal",
booktitle = "Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.bsnlp-1.2/",
doi = "10.18653/v1/2025.bsnlp-1.2",
pages = "9--16",
ISBN = "978-1-959429-57-9",
  abstract = "Humor detection in low-resource languages is hampered by cultural nuance and subjective annotation. We test two large language models, GPT-4 and Gemini 2.5 Flash, on labeling humor in 6,000 Croatian tweets with expert gold labels generated through a rigorous annotation pipeline. LLM{--}human agreement ({\ensuremath{\kappa}} = 0.28) matches human{--}human agreement ({\ensuremath{\kappa}} = 0.27), while LLM{--}LLM agreement is substantially higher ({\ensuremath{\kappa}} = 0.63). Although concordance with expert adjudication is lower, additional metrics imply that the models equal a second human annotator while working far faster and at negligible cost. These findings suggest that, even with simple prompting, LLMs can efficiently bootstrap subjective datasets and serve as practical annotation assistants in linguistically under-represented settings."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="bago-bakaric-2025-shot">
    <titleInfo>
      <title>Few-Shot Prompting, Full-Scale Confusion: Evaluating Large Language Models for Humor Detection in Croatian Tweets</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Petra</namePart>
      <namePart type="family">Bago</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nikola</namePart>
      <namePart type="family">Bakarić</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Jakub</namePart>
        <namePart type="family">Piskorski</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pavel</namePart>
        <namePart type="family">Přibáň</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Preslav</namePart>
        <namePart type="family">Nakov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Roman</namePart>
        <namePart type="family">Yangarber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Michal</namePart>
        <namePart type="family">Marcinczuk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">978-1-959429-57-9</identifier>
    </relatedItem>
    <abstract>Humor detection in low-resource languages is hampered by cultural nuance and subjective annotation. We test two large language models, GPT-4 and Gemini 2.5 Flash, on labeling humor in 6,000 Croatian tweets with expert gold labels generated through a rigorous annotation pipeline. LLM–human agreement (κ = 0.28) matches human–human agreement (κ = 0.27), while LLM–LLM agreement is substantially higher (κ = 0.63). Although concordance with expert adjudication is lower, additional metrics imply that the models equal a second human annotator while working far faster and at negligible cost. These findings suggest that, even with simple prompting, LLMs can efficiently bootstrap subjective datasets and serve as practical annotation assistants in linguistically under-represented settings.</abstract>
    <identifier type="citekey">bago-bakaric-2025-shot</identifier>
    <identifier type="doi">10.18653/v1/2025.bsnlp-1.2</identifier>
    <location>
      <url>https://aclanthology.org/2025.bsnlp-1.2/</url>
    </location>
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>9</start>
        <end>16</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Few-Shot Prompting, Full-Scale Confusion: Evaluating Large Language Models for Humor Detection in Croatian Tweets
%A Bago, Petra
%A Bakarić, Nikola
%Y Piskorski, Jakub
%Y Přibáň, Pavel
%Y Nakov, Preslav
%Y Yangarber, Roman
%Y Marcinczuk, Michal
%S Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 978-1-959429-57-9
%F bago-bakaric-2025-shot
%X Humor detection in low-resource languages is hampered by cultural nuance and subjective annotation. We test two large language models, GPT-4 and Gemini 2.5 Flash, on labeling humor in 6,000 Croatian tweets with expert gold labels generated through a rigorous annotation pipeline. LLM–human agreement (κ = 0.28) matches human–human agreement (κ = 0.27), while LLM–LLM agreement is substantially higher (κ = 0.63). Although concordance with expert adjudication is lower, additional metrics imply that the models equal a second human annotator while working far faster and at negligible cost. These findings suggest that, even with simple prompting, LLMs can efficiently bootstrap subjective datasets and serve as practical annotation assistants in linguistically under-represented settings.
%R 10.18653/v1/2025.bsnlp-1.2
%U https://aclanthology.org/2025.bsnlp-1.2/
%U https://doi.org/10.18653/v1/2025.bsnlp-1.2
%P 9-16

Markdown (Informal)
[Few-Shot Prompting, Full-Scale Confusion: Evaluating Large Language Models for Humor Detection in Croatian Tweets](https://aclanthology.org/2025.bsnlp-1.2/) (Bago & Bakarić, BSNLP 2025)
ACL
Petra Bago and Nikola Bakarić. 2025. Few-Shot Prompting, Full-Scale Confusion: Evaluating Large Language Models for Humor Detection in Croatian Tweets. In Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025), pages 9–16, Vienna, Austria. Association for Computational Linguistics.