@inproceedings{klemen-etal-2024-si,
title = "{SI}-{NLI}: A {S}lovene Natural Language Inference Dataset and Its Evaluation",
author = "Klemen, Matej and
{\v{Z}}agar, Ale{\v{s}} and
{\v{C}}ibej, Jaka and
Robnik-{\v{S}}ikonja, Marko",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1294",
pages = "14859--14870",
abstract = "Natural language inference (NLI) is an important language understanding benchmark. Two deficiencies of this benchmark are: i) most existing NLI datasets exist for English and a few other well-resourced languages, and ii) most NLI datasets are formed with a narrow set of annotators{'} instructions, allowing the prediction models to capture linguistic clues instead of measuring true reasoning capability. We address both issues and introduce SI-NLI, the first dataset for Slovene natural language inference. The dataset is constructed from scratch using knowledgeable annotators with carefully crafted guidelines aiming to avoid commonly encountered problems in existing NLI datasets. We also manually translate the SI-NLI to English to enable cross-lingual model training and evaluation. Using the newly created dataset and its translation, we train and evaluate a variety of large transformer language models in a monolingual and cross-lingual setting. The results indicate that larger models, in general, achieve better performance. The qualitative analysis shows that the SI-NLI dataset is diverse and that there remains plenty of room for improvement even for the largest models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="klemen-etal-2024-si">
<titleInfo>
<title>SI-NLI: A Slovene Natural Language Inference Dataset and Its Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Matej</namePart>
<namePart type="family">Klemen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aleš</namePart>
<namePart type="family">Žagar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaka</namePart>
<namePart type="family">Čibej</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Robnik-Šikonja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Natural language inference (NLI) is an important language understanding benchmark. Two deficiencies of this benchmark are: i) most existing NLI datasets exist for English and a few other well-resourced languages, and ii) most NLI datasets are formed with a narrow set of annotators’ instructions, allowing the prediction models to capture linguistic clues instead of measuring true reasoning capability. We address both issues and introduce SI-NLI, the first dataset for Slovene natural language inference. The dataset is constructed from scratch using knowledgeable annotators with carefully crafted guidelines aiming to avoid commonly encountered problems in existing NLI datasets. We also manually translate the SI-NLI to English to enable cross-lingual model training and evaluation. Using the newly created dataset and its translation, we train and evaluate a variety of large transformer language models in a monolingual and cross-lingual setting. The results indicate that larger models, in general, achieve better performance. The qualitative analysis shows that the SI-NLI dataset is diverse and that there remains plenty of room for improvement even for the largest models.</abstract>
<identifier type="citekey">klemen-etal-2024-si</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1294</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>14859</start>
<end>14870</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SI-NLI: A Slovene Natural Language Inference Dataset and Its Evaluation
%A Klemen, Matej
%A Žagar, Aleš
%A Čibej, Jaka
%A Robnik-Šikonja, Marko
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F klemen-etal-2024-si
%X Natural language inference (NLI) is an important language understanding benchmark. Two deficiencies of this benchmark are: i) most existing NLI datasets exist for English and a few other well-resourced languages, and ii) most NLI datasets are formed with a narrow set of annotators’ instructions, allowing the prediction models to capture linguistic clues instead of measuring true reasoning capability. We address both issues and introduce SI-NLI, the first dataset for Slovene natural language inference. The dataset is constructed from scratch using knowledgeable annotators with carefully crafted guidelines aiming to avoid commonly encountered problems in existing NLI datasets. We also manually translate the SI-NLI to English to enable cross-lingual model training and evaluation. Using the newly created dataset and its translation, we train and evaluate a variety of large transformer language models in a monolingual and cross-lingual setting. The results indicate that larger models, in general, achieve better performance. The qualitative analysis shows that the SI-NLI dataset is diverse and that there remains plenty of room for improvement even for the largest models.
%U https://aclanthology.org/2024.lrec-main.1294
%P 14859-14870
Markdown (Informal)
[SI-NLI: A Slovene Natural Language Inference Dataset and Its Evaluation](https://aclanthology.org/2024.lrec-main.1294) (Klemen et al., LREC-COLING 2024)
ACL