@inproceedings{obadic-etal-2023-c,
title = "{C}-{XNLI}: {C}roatian Extension of {XNLI} Dataset",
author = "Obadi{\'c}, Leo and
Jertec, Andrej and
Rajnovi{\'c}, Marko and
Dropulji{\'c}, Branimir",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.142",
doi = "10.18653/v1/2023.findings-acl.142",
pages = "2258--2267",
abstract = "Comprehensive multilingual evaluations have been encouraged by emerging cross-lingual benchmarks and constrained by existing parallel datasets. To partially mitigate this limitation, we extended the Cross-lingual Natural Language Inference (XNLI) corpus with Croatian. The development and test sets were translated by a professional translator, and we show that Croatian is consistent with other XNLI dubs. The train set is translated using Facebook{'}s 1.2B parameter m2m{\_}100 model. We thoroughly analyze the Croatian train set and compare its quality with the existing machine-translated German set. The comparison is based on 2000 manually scored sentences per language using a variant of the Direct Assessment (DA) score commonly used at the Conference on Machine Translation (WMT). Our findings reveal that a less-resourced language like Croatian is still lacking in translation quality of longer sentences compared to German. However, both sets have a substantial amount of poor quality translations, which should be considered in translation-based training or evaluation setups.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="obadic-etal-2023-c">
<titleInfo>
<title>C-XNLI: Croatian Extension of XNLI Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Obadić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrej</namePart>
<namePart type="family">Jertec</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Rajnović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Branimir</namePart>
<namePart type="family">Dropuljić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Comprehensive multilingual evaluations have been encouraged by emerging cross-lingual benchmarks and constrained by existing parallel datasets. To partially mitigate this limitation, we extended the Cross-lingual Natural Language Inference (XNLI) corpus with Croatian. The development and test sets were translated by a professional translator, and we show that Croatian is consistent with other XNLI dubs. The train set is translated using Facebook’s 1.2B parameter m2m_100 model. We thoroughly analyze the Croatian train set and compare its quality with the existing machine-translated German set. The comparison is based on 2000 manually scored sentences per language using a variant of the Direct Assessment (DA) score commonly used at the Conference on Machine Translation (WMT). Our findings reveal that a less-resourced language like Croatian is still lacking in translation quality of longer sentences compared to German. However, both sets have a substantial amount of poor quality translations, which should be considered in translation-based training or evaluation setups.</abstract>
<identifier type="citekey">obadic-etal-2023-c</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.142</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.142</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>2258</start>
<end>2267</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T C-XNLI: Croatian Extension of XNLI Dataset
%A Obadić, Leo
%A Jertec, Andrej
%A Rajnović, Marko
%A Dropuljić, Branimir
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F obadic-etal-2023-c
%X Comprehensive multilingual evaluations have been encouraged by emerging cross-lingual benchmarks and constrained by existing parallel datasets. To partially mitigate this limitation, we extended the Cross-lingual Natural Language Inference (XNLI) corpus with Croatian. The development and test sets were translated by a professional translator, and we show that Croatian is consistent with other XNLI dubs. The train set is translated using Facebook’s 1.2B parameter m2m_100 model. We thoroughly analyze the Croatian train set and compare its quality with the existing machine-translated German set. The comparison is based on 2000 manually scored sentences per language using a variant of the Direct Assessment (DA) score commonly used at the Conference on Machine Translation (WMT). Our findings reveal that a less-resourced language like Croatian is still lacking in translation quality of longer sentences compared to German. However, both sets have a substantial amount of poor quality translations, which should be considered in translation-based training or evaluation setups.
%R 10.18653/v1/2023.findings-acl.142
%U https://aclanthology.org/2023.findings-acl.142
%U https://doi.org/10.18653/v1/2023.findings-acl.142
%P 2258-2267
Markdown (Informal)
[C-XNLI: Croatian Extension of XNLI Dataset](https://aclanthology.org/2023.findings-acl.142) (Obadić et al., Findings 2023)
ACL
- Leo Obadić, Andrej Jertec, Marko Rajnović, and Branimir Dropuljić. 2023. C-XNLI: Croatian Extension of XNLI Dataset. In Findings of the Association for Computational Linguistics: ACL 2023, pages 2258–2267, Toronto, Canada. Association for Computational Linguistics.