@inproceedings{kim-kim-2022-vacillating,
title = "Vacillating Human Correlation of {S}acre{BLEU} in Unprotected Languages",
author = "Kim, Ahrii and
Kim, Jinhyeon",
editor = "Belz, Anya and
Popovi{\'c}, Maja and
Reiter, Ehud and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2nd Workshop on Human Evaluation of NLP Systems (HumEval)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.humeval-1.1",
doi = "10.18653/v1/2022.humeval-1.1",
pages = "1--15",
abstract = "SacreBLEU, by incorporating a text normalizing step in the pipeline, has become a rising automatic evaluation metric in recent MT studies. With agglutinative languages such as Korean, however, the lexical-level metric cannot provide a conceivable result without a customized pre-tokenization. This paper endeavors to ex- amine the influence of diversified tokenization schemes {--}word, morpheme, subword, character, and consonants {\&} vowels (CV){--} on the metric after its protective layer is peeled off. By performing meta-evaluation with manually- constructed into-Korean resources, our empirical study demonstrates that the human correlation of the surface-based metric and other homogeneous ones (as an extension) vacillates greatly by the token type. Moreover, the human correlation of the metric often deteriorates due to some tokenization, with CV one of its culprits. Guiding through the proper usage of tokenizers for the given metric, we discover i) the feasibility of the character tokens and ii) the deficit of CV in the Korean MT evaluation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kim-kim-2022-vacillating">
<titleInfo>
<title>Vacillating Human Correlation of SacreBLEU in Unprotected Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ahrii</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinhyeon</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Human Evaluation of NLP Systems (HumEval)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anya</namePart>
<namePart type="family">Belz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maja</namePart>
<namePart type="family">Popović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehud</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>SacreBLEU, by incorporating a text normalizing step in the pipeline, has become a rising automatic evaluation metric in recent MT studies. With agglutinative languages such as Korean, however, the lexical-level metric cannot provide a conceivable result without a customized pre-tokenization. This paper endeavors to ex- amine the influence of diversified tokenization schemes –word, morpheme, subword, character, and consonants & vowels (CV)– on the metric after its protective layer is peeled off. By performing meta-evaluation with manually- constructed into-Korean resources, our empirical study demonstrates that the human correlation of the surface-based metric and other homogeneous ones (as an extension) vacillates greatly by the token type. Moreover, the human correlation of the metric often deteriorates due to some tokenization, with CV one of its culprits. Guiding through the proper usage of tokenizers for the given metric, we discover i) the feasibility of the character tokens and ii) the deficit of CV in the Korean MT evaluation.</abstract>
<identifier type="citekey">kim-kim-2022-vacillating</identifier>
<identifier type="doi">10.18653/v1/2022.humeval-1.1</identifier>
<location>
<url>https://aclanthology.org/2022.humeval-1.1</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>1</start>
<end>15</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Vacillating Human Correlation of SacreBLEU in Unprotected Languages
%A Kim, Ahrii
%A Kim, Jinhyeon
%Y Belz, Anya
%Y Popović, Maja
%Y Reiter, Ehud
%Y Shimorina, Anastasia
%S Proceedings of the 2nd Workshop on Human Evaluation of NLP Systems (HumEval)
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F kim-kim-2022-vacillating
%X SacreBLEU, by incorporating a text normalizing step in the pipeline, has become a rising automatic evaluation metric in recent MT studies. With agglutinative languages such as Korean, however, the lexical-level metric cannot provide a conceivable result without a customized pre-tokenization. This paper endeavors to ex- amine the influence of diversified tokenization schemes –word, morpheme, subword, character, and consonants & vowels (CV)– on the metric after its protective layer is peeled off. By performing meta-evaluation with manually- constructed into-Korean resources, our empirical study demonstrates that the human correlation of the surface-based metric and other homogeneous ones (as an extension) vacillates greatly by the token type. Moreover, the human correlation of the metric often deteriorates due to some tokenization, with CV one of its culprits. Guiding through the proper usage of tokenizers for the given metric, we discover i) the feasibility of the character tokens and ii) the deficit of CV in the Korean MT evaluation.
%R 10.18653/v1/2022.humeval-1.1
%U https://aclanthology.org/2022.humeval-1.1
%U https://doi.org/10.18653/v1/2022.humeval-1.1
%P 1-15
Markdown (Informal)
[Vacillating Human Correlation of SacreBLEU in Unprotected Languages](https://aclanthology.org/2022.humeval-1.1) (Kim & Kim, HumEval 2022)
ACL