@inproceedings{leiter-2021-reference,
title = "Reference-Free Word- and Sentence-Level Translation Evaluation with Token-Matching Metrics",
author = "Leiter, Christoph Wolfgang",
editor = "Gao, Yang and
Eger, Steffen and
Zhao, Wei and
Lertvittayakumjorn, Piyawat and
Fomicheva, Marina",
booktitle = "Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.eval4nlp-1.16",
doi = "10.18653/v1/2021.eval4nlp-1.16",
pages = "157--164",
abstract = "Many modern machine translation evaluation metrics like BERTScore, BLEURT, COMET, MonoTransquest or XMoverScore are based on black-box language models. Hence, it is difficult to explain why these metrics return certain scores. This year{'}s Eval4NLP shared task tackles this challenge by searching for methods that can extract feature importance scores that correlate well with human word-level error annotations. In this paper we show that unsupervised metrics that are based on tokenmatching can intrinsically provide such scores. The submitted system interprets the similarities of the contextualized word-embeddings that are used to compute (X)BERTScore as word-level importance scores.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="leiter-2021-reference">
<titleInfo>
<title>Reference-Free Word- and Sentence-Level Translation Evaluation with Token-Matching Metrics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christoph</namePart>
<namePart type="given">Wolfgang</namePart>
<namePart type="family">Leiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steffen</namePart>
<namePart type="family">Eger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Piyawat</namePart>
<namePart type="family">Lertvittayakumjorn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marina</namePart>
<namePart type="family">Fomicheva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Many modern machine translation evaluation metrics like BERTScore, BLEURT, COMET, MonoTransquest or XMoverScore are based on black-box language models. Hence, it is difficult to explain why these metrics return certain scores. This year’s Eval4NLP shared task tackles this challenge by searching for methods that can extract feature importance scores that correlate well with human word-level error annotations. In this paper we show that unsupervised metrics that are based on tokenmatching can intrinsically provide such scores. The submitted system interprets the similarities of the contextualized word-embeddings that are used to compute (X)BERTScore as word-level importance scores.</abstract>
<identifier type="citekey">leiter-2021-reference</identifier>
<identifier type="doi">10.18653/v1/2021.eval4nlp-1.16</identifier>
<location>
<url>https://aclanthology.org/2021.eval4nlp-1.16</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>157</start>
<end>164</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Reference-Free Word- and Sentence-Level Translation Evaluation with Token-Matching Metrics
%A Leiter, Christoph Wolfgang
%Y Gao, Yang
%Y Eger, Steffen
%Y Zhao, Wei
%Y Lertvittayakumjorn, Piyawat
%Y Fomicheva, Marina
%S Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F leiter-2021-reference
%X Many modern machine translation evaluation metrics like BERTScore, BLEURT, COMET, MonoTransquest or XMoverScore are based on black-box language models. Hence, it is difficult to explain why these metrics return certain scores. This year’s Eval4NLP shared task tackles this challenge by searching for methods that can extract feature importance scores that correlate well with human word-level error annotations. In this paper we show that unsupervised metrics that are based on tokenmatching can intrinsically provide such scores. The submitted system interprets the similarities of the contextualized word-embeddings that are used to compute (X)BERTScore as word-level importance scores.
%R 10.18653/v1/2021.eval4nlp-1.16
%U https://aclanthology.org/2021.eval4nlp-1.16
%U https://doi.org/10.18653/v1/2021.eval4nlp-1.16
%P 157-164
Markdown (Informal)
[Reference-Free Word- and Sentence-Level Translation Evaluation with Token-Matching Metrics](https://aclanthology.org/2021.eval4nlp-1.16) (Leiter, Eval4NLP 2021)
ACL