@inproceedings{strozyna-etal-2026-evaluating,
title = "Evaluating Multilingual Sentiment Classifiers Using an {LLM}-Annotated {W}ikipedia Benchmark",
author = "Str{\'o}{\.z}yna, Milena and
Lewoniewski, W{\l}odzimierz and
Czuma{\l}owska, Izabela",
editor = "Mille, Simon and
Gehrmann, Sebastian and
Schmidtov{\'a}, Patr{\'i}cia and
Du{\v{s}}ek, Ond{\v{r}}ej and
Fadaee, Marzieh and
Lo, Kyle and
Santus, Enrico and
Stanovsky, Gabriel",
booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.gem-main.63/",
pages = "692--703",
ISBN = "979-8-89176-423-1",
abstract = "We present a multilingual study of sentiment evaluation on Wikipedia articles from various topics in five languages (German, English, Spanish, Polish, and Russian). In this paper, we compare three large language models (Gemini Pro 3.1, Claude Opus 4.6, and GPT 5.2), each queried three times per sentence, with two popular multilingual sentiment classifiers. This setup allows us to analyze not only inter-model differences but also intra-model stability as a proxy for confidence. To support systematic evaluation, we construct a benchmark dataset based on strict consensus across evaluators and analyze sentiment distributions across topics and languages. We show substantial variation in sentiment distributions, agreement, and consistency across models and languages. Our results suggest that sentiment evaluation on encyclopedic text remains an underexplored challenge for multilingual NLP."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="strozyna-etal-2026-evaluating">
<titleInfo>
<title>Evaluating Multilingual Sentiment Classifiers Using an LLM-Annotated Wikipedia Benchmark</title>
</titleInfo>
<name type="personal">
<namePart type="given">Milena</namePart>
<namePart type="family">Stróżyna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Włodzimierz</namePart>
<namePart type="family">Lewoniewski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Izabela</namePart>
<namePart type="family">Czumałowska</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrícia</namePart>
<namePart type="family">Schmidtová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Dušek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marzieh</namePart>
<namePart type="family">Fadaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-423-1</identifier>
</relatedItem>
<abstract>We present a multilingual study of sentiment evaluation on Wikipedia articles from various topics in five languages (German, English, Spanish, Polish, and Russian). In this paper, we compare three large language models (Gemini Pro 3.1, Claude Opus 4.6, and GPT 5.2), each queried three times per sentence, with two popular multilingual sentiment classifiers. This setup allows us to analyze not only inter-model differences but also intra-model stability as a proxy for confidence. To support systematic evaluation, we construct a benchmark dataset based on strict consensus across evaluators and analyze sentiment distributions across topics and languages. We show substantial variation in sentiment distributions, agreement, and consistency across models and languages. Our results suggest that sentiment evaluation on encyclopedic text remains an underexplored challenge for multilingual NLP.</abstract>
<identifier type="citekey">strozyna-etal-2026-evaluating</identifier>
<location>
<url>https://aclanthology.org/2026.gem-main.63/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>692</start>
<end>703</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Multilingual Sentiment Classifiers Using an LLM-Annotated Wikipedia Benchmark
%A Stróżyna, Milena
%A Lewoniewski, Włodzimierz
%A Czumałowska, Izabela
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F strozyna-etal-2026-evaluating
%X We present a multilingual study of sentiment evaluation on Wikipedia articles from various topics in five languages (German, English, Spanish, Polish, and Russian). In this paper, we compare three large language models (Gemini Pro 3.1, Claude Opus 4.6, and GPT 5.2), each queried three times per sentence, with two popular multilingual sentiment classifiers. This setup allows us to analyze not only inter-model differences but also intra-model stability as a proxy for confidence. To support systematic evaluation, we construct a benchmark dataset based on strict consensus across evaluators and analyze sentiment distributions across topics and languages. We show substantial variation in sentiment distributions, agreement, and consistency across models and languages. Our results suggest that sentiment evaluation on encyclopedic text remains an underexplored challenge for multilingual NLP.
%U https://aclanthology.org/2026.gem-main.63/
%P 692-703
Markdown (Informal)
[Evaluating Multilingual Sentiment Classifiers Using an LLM-Annotated Wikipedia Benchmark](https://aclanthology.org/2026.gem-main.63/) (Stróżyna et al., GEM 2026)
ACL