@inproceedings{hamotskyi-etal-2024-eval,
title = "Eval-{UA}-tion 1.0: Benchmark for Evaluating {U}krainian (Large) Language Models",
author = {Hamotskyi, Serhii and
Levbarg, Anna-Izabella and
H{\"a}nig, Christian},
editor = "Romanyshyn, Mariana and
Romanyshyn, Nataliia and
Hlybovets, Andrii and
Ignatenko, Oleksii",
booktitle = "Proceedings of the Third Ukrainian Natural Language Processing Workshop (UNLP) @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.unlp-1.13",
pages = "109--119",
abstract = "In this paper, we introduce Eval-UA-tion, a set of novel Ukrainian-language datasets aimed at evaluating the performance of language models on the Ukrainian language. The tasks include UA-CBT (inspired by the Children{'}s Book Test, a fill-in-the-gaps type task aimed at gauging the extent to which a story narrative is understood), UP-Titles (where the online newspaper \textit{Ukrainska Pravda}{`}s articles have to be matched to the correct title among 10 similar ones), and LMentry-static-UA/LMES (inspired by the LMentry benchmark, a set of tasks simple to solve for humans but hard for LMs, such as {`}which of these words is longer{'} and {`}what is the fifth word of this sentence{'}). With the exception of UP-Titles, the tasks are built in a way to minimize contamination and use material unlikely to be present in the training sets of language models, and include a split for few-shot model prompting use that minimizes contamination. For each task human and random baselines are provided.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hamotskyi-etal-2024-eval">
<titleInfo>
<title>Eval-UA-tion 1.0: Benchmark for Evaluating Ukrainian (Large) Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Serhii</namePart>
<namePart type="family">Hamotskyi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna-Izabella</namePart>
<namePart type="family">Levbarg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Hänig</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Ukrainian Natural Language Processing Workshop (UNLP) @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mariana</namePart>
<namePart type="family">Romanyshyn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nataliia</namePart>
<namePart type="family">Romanyshyn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrii</namePart>
<namePart type="family">Hlybovets</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oleksii</namePart>
<namePart type="family">Ignatenko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we introduce Eval-UA-tion, a set of novel Ukrainian-language datasets aimed at evaluating the performance of language models on the Ukrainian language. The tasks include UA-CBT (inspired by the Children’s Book Test, a fill-in-the-gaps type task aimed at gauging the extent to which a story narrative is understood), UP-Titles (where the online newspaper Ukrainska Pravda‘s articles have to be matched to the correct title among 10 similar ones), and LMentry-static-UA/LMES (inspired by the LMentry benchmark, a set of tasks simple to solve for humans but hard for LMs, such as ‘which of these words is longer’ and ‘what is the fifth word of this sentence’). With the exception of UP-Titles, the tasks are built in a way to minimize contamination and use material unlikely to be present in the training sets of language models, and include a split for few-shot model prompting use that minimizes contamination. For each task human and random baselines are provided.</abstract>
<identifier type="citekey">hamotskyi-etal-2024-eval</identifier>
<location>
<url>https://aclanthology.org/2024.unlp-1.13</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>109</start>
<end>119</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Eval-UA-tion 1.0: Benchmark for Evaluating Ukrainian (Large) Language Models
%A Hamotskyi, Serhii
%A Levbarg, Anna-Izabella
%A Hänig, Christian
%Y Romanyshyn, Mariana
%Y Romanyshyn, Nataliia
%Y Hlybovets, Andrii
%Y Ignatenko, Oleksii
%S Proceedings of the Third Ukrainian Natural Language Processing Workshop (UNLP) @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F hamotskyi-etal-2024-eval
%X In this paper, we introduce Eval-UA-tion, a set of novel Ukrainian-language datasets aimed at evaluating the performance of language models on the Ukrainian language. The tasks include UA-CBT (inspired by the Children’s Book Test, a fill-in-the-gaps type task aimed at gauging the extent to which a story narrative is understood), UP-Titles (where the online newspaper Ukrainska Pravda‘s articles have to be matched to the correct title among 10 similar ones), and LMentry-static-UA/LMES (inspired by the LMentry benchmark, a set of tasks simple to solve for humans but hard for LMs, such as ‘which of these words is longer’ and ‘what is the fifth word of this sentence’). With the exception of UP-Titles, the tasks are built in a way to minimize contamination and use material unlikely to be present in the training sets of language models, and include a split for few-shot model prompting use that minimizes contamination. For each task human and random baselines are provided.
%U https://aclanthology.org/2024.unlp-1.13
%P 109-119
Markdown (Informal)
[Eval-UA-tion 1.0: Benchmark for Evaluating Ukrainian (Large) Language Models](https://aclanthology.org/2024.unlp-1.13) (Hamotskyi et al., UNLP 2024)
ACL