@inproceedings{pugachev-etal-2025-repa,
title = "{REPA}: {R}ussian Error Types Annotation for Evaluating Text Generation and Judgment Capabilities",
author = "Pugachev, Alexander and
Fenogenova, Alena and
Mikhailov, Vladislav and
Artemova, Ekaterina",
editor = "Piskorski, Jakub and
P{\v{r}}ib{\'a}{\v{n}}, Pavel and
Nakov, Preslav and
Yangarber, Roman and
Marcinczuk, Michal",
booktitle = "Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.bsnlp-1.16/",
doi = "10.18653/v1/2025.bsnlp-1.16",
pages = "136--150",
ISBN = "978-1-959429-57-9",
abstract = "Recent advances in large language models (LLMs) have introduced the novel paradigm of using LLMs as judges, where an LLM evaluates and scores the outputs of another LLM, often correlating highly with human preferences. However, the use of LLM-as-a-judge has been primarily studied in English. In this paper, we evaluate this framework in Russian by introducing the Russian Error tyPes Annotation dataset (REPA, (eng: turnip)), a dataset of 1,000 user queries and 2,000 LLM-generated responses. Human annotators labeled each response pair, expressing their preferences across ten specific error types, as well as selecting an overall preference. We rank six generative LLMs across the error types using three rating systems based on human preferences. We also evaluate responses using eight LLM judges in zero-shot and few-shot settings. We describe the results of analyzing the judges and position and length biases. Our findings reveal a notable gap between LLM judge performance in Russian and English. However, rankings based on human and LLM preferences show partial alignment, suggesting that while current LLM judges struggle with fine-grained evaluation in Russian, there is potential for improvement."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="pugachev-etal-2025-repa">
    <titleInfo>
      <title>REPA: Russian Error Types Annotation for Evaluating Text Generation and Judgment Capabilities</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Alexander</namePart>
      <namePart type="family">Pugachev</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alena</namePart>
      <namePart type="family">Fenogenova</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vladislav</namePart>
      <namePart type="family">Mikhailov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ekaterina</namePart>
      <namePart type="family">Artemova</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Jakub</namePart>
        <namePart type="family">Piskorski</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pavel</namePart>
        <namePart type="family">Přibáň</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Preslav</namePart>
        <namePart type="family">Nakov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Roman</namePart>
        <namePart type="family">Yangarber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Michal</namePart>
        <namePart type="family">Marcinczuk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">978-1-959429-57-9</identifier>
    </relatedItem>
    <abstract>Recent advances in large language models (LLMs) have introduced the novel paradigm of using LLMs as judges, where an LLM evaluates and scores the outputs of another LLM, often correlating highly with human preferences. However, the use of LLM-as-a-judge has been primarily studied in English. In this paper, we evaluate this framework in Russian by introducing the Russian Error tyPes Annotation dataset (REPA; eng.: turnip), which consists of 1,000 user queries and 2,000 LLM-generated responses. Human annotators labeled each response pair, expressing their preferences across ten specific error types, as well as selecting an overall preference. We rank six generative LLMs across the error types using three rating systems based on human preferences. We also evaluate responses using eight LLM judges in zero-shot and few-shot settings, and analyze the judges' position and length biases. Our findings reveal a notable gap between LLM judge performance in Russian and English. However, rankings based on human and LLM preferences show partial alignment, suggesting that while current LLM judges struggle with fine-grained evaluation in Russian, there is potential for improvement.</abstract>
<identifier type="citekey">pugachev-etal-2025-repa</identifier>
<identifier type="doi">10.18653/v1/2025.bsnlp-1.16</identifier>
<location>
<url>https://aclanthology.org/2025.bsnlp-1.16/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>136</start>
<end>150</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T REPA: Russian Error Types Annotation for Evaluating Text Generation and Judgment Capabilities
%A Pugachev, Alexander
%A Fenogenova, Alena
%A Mikhailov, Vladislav
%A Artemova, Ekaterina
%Y Piskorski, Jakub
%Y Přibáň, Pavel
%Y Nakov, Preslav
%Y Yangarber, Roman
%Y Marcinczuk, Michal
%S Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 978-1-959429-57-9
%F pugachev-etal-2025-repa
%X Recent advances in large language models (LLMs) have introduced the novel paradigm of using LLMs as judges, where an LLM evaluates and scores the outputs of another LLM, often correlating highly with human preferences. However, the use of LLM-as-a-judge has been primarily studied in English. In this paper, we evaluate this framework in Russian by introducing the Russian Error tyPes Annotation dataset (REPA; eng.: turnip), which consists of 1,000 user queries and 2,000 LLM-generated responses. Human annotators labeled each response pair, expressing their preferences across ten specific error types, as well as selecting an overall preference. We rank six generative LLMs across the error types using three rating systems based on human preferences. We also evaluate responses using eight LLM judges in zero-shot and few-shot settings, and analyze the judges' position and length biases. Our findings reveal a notable gap between LLM judge performance in Russian and English. However, rankings based on human and LLM preferences show partial alignment, suggesting that while current LLM judges struggle with fine-grained evaluation in Russian, there is potential for improvement.
%R 10.18653/v1/2025.bsnlp-1.16
%U https://aclanthology.org/2025.bsnlp-1.16/
%U https://doi.org/10.18653/v1/2025.bsnlp-1.16
%P 136-150
Markdown (Informal)
[REPA: Russian Error Types Annotation for Evaluating Text Generation and Judgment Capabilities](https://aclanthology.org/2025.bsnlp-1.16/) (Pugachev et al., BSNLP 2025)
ACL
Alexander Pugachev, Alena Fenogenova, Vladislav Mikhailov, and Ekaterina Artemova. 2025. [REPA: Russian Error Types Annotation for Evaluating Text Generation and Judgment Capabilities](https://aclanthology.org/2025.bsnlp-1.16/). In *Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)*, pages 136–150, Vienna, Austria. Association for Computational Linguistics.