@inproceedings{yacouby-axman-2020-probabilistic,
title = "Probabilistic Extension of Precision, Recall, and F1 Score for More Thorough Evaluation of Classification Models",
author = "Yacouby, Reda and
Axman, Dustin",
editor = "Eger, Steffen and
Gao, Yang and
Peyrard, Maxime and
Zhao, Wei and
Hovy, Eduard",
booktitle = "Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.eval4nlp-1.9/",
doi = "10.18653/v1/2020.eval4nlp-1.9",
pages = "79--91",
abstract = "In pursuit of the perfect supervised NLP classifier, razor thin margins and low-resource test sets can make modeling decisions difficult. Popular metrics such as Accuracy, Precision, and Recall are often insufficient as they fail to give a complete picture of the model`s behavior. We present a probabilistic extension of Precision, Recall, and F1 score, which we refer to as confidence-Precision (cPrecision), confidence-Recall (cRecall), and confidence-F1 (cF1) respectively. The proposed metrics address some of the challenges faced when evaluating large-scale NLP systems, specifically when the model`s confidence score assignments have an impact on the system`s behavior. We describe four key benefits of our proposed metrics as compared to their threshold-based counterparts. Two of these benefits, which we refer to as robustness to missing values and sensitivity to model confidence score assignments are self-evident from the metrics' definitions; the remaining benefits, generalization, and functional consistency are demonstrated empirically."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yacouby-axman-2020-probabilistic">
<titleInfo>
<title>Probabilistic Extension of Precision, Recall, and F1 Score for More Thorough Evaluation of Classification Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Reda</namePart>
<namePart type="family">Yacouby</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dustin</namePart>
<namePart type="family">Axman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Steffen</namePart>
<namePart type="family">Eger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maxime</namePart>
<namePart type="family">Peyrard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eduard</namePart>
<namePart type="family">Hovy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In pursuit of the perfect supervised NLP classifier, razor thin margins and low-resource test sets can make modeling decisions difficult. Popular metrics such as Accuracy, Precision, and Recall are often insufficient as they fail to give a complete picture of the model's behavior. We present a probabilistic extension of Precision, Recall, and F1 score, which we refer to as confidence-Precision (cPrecision), confidence-Recall (cRecall), and confidence-F1 (cF1) respectively. The proposed metrics address some of the challenges faced when evaluating large-scale NLP systems, specifically when the model's confidence score assignments have an impact on the system's behavior. We describe four key benefits of our proposed metrics as compared to their threshold-based counterparts. Two of these benefits, which we refer to as robustness to missing values and sensitivity to model confidence score assignments are self-evident from the metrics' definitions; the remaining benefits, generalization, and functional consistency are demonstrated empirically.</abstract>
<identifier type="citekey">yacouby-axman-2020-probabilistic</identifier>
<identifier type="doi">10.18653/v1/2020.eval4nlp-1.9</identifier>
<location>
<url>https://aclanthology.org/2020.eval4nlp-1.9/</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>79</start>
<end>91</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Probabilistic Extension of Precision, Recall, and F1 Score for More Thorough Evaluation of Classification Models
%A Yacouby, Reda
%A Axman, Dustin
%Y Eger, Steffen
%Y Gao, Yang
%Y Peyrard, Maxime
%Y Zhao, Wei
%Y Hovy, Eduard
%S Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F yacouby-axman-2020-probabilistic
%X In pursuit of the perfect supervised NLP classifier, razor thin margins and low-resource test sets can make modeling decisions difficult. Popular metrics such as Accuracy, Precision, and Recall are often insufficient as they fail to give a complete picture of the model's behavior. We present a probabilistic extension of Precision, Recall, and F1 score, which we refer to as confidence-Precision (cPrecision), confidence-Recall (cRecall), and confidence-F1 (cF1) respectively. The proposed metrics address some of the challenges faced when evaluating large-scale NLP systems, specifically when the model's confidence score assignments have an impact on the system's behavior. We describe four key benefits of our proposed metrics as compared to their threshold-based counterparts. Two of these benefits, which we refer to as robustness to missing values and sensitivity to model confidence score assignments are self-evident from the metrics' definitions; the remaining benefits, generalization, and functional consistency are demonstrated empirically.
%R 10.18653/v1/2020.eval4nlp-1.9
%U https://aclanthology.org/2020.eval4nlp-1.9/
%U https://doi.org/10.18653/v1/2020.eval4nlp-1.9
%P 79-91
Markdown (Informal)
[Probabilistic Extension of Precision, Recall, and F1 Score for More Thorough Evaluation of Classification Models](https://aclanthology.org/2020.eval4nlp-1.9/) (Yacouby & Axman, Eval4NLP 2020)
ACL
Reda Yacouby and Dustin Axman. 2020. [Probabilistic Extension of Precision, Recall, and F1 Score for More Thorough Evaluation of Classification Models](https://aclanthology.org/2020.eval4nlp-1.9/). In *Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems*, pages 79–91, Online. Association for Computational Linguistics.
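
The abstract above describes confidence-weighted analogues of Precision, Recall, and F1 that replace hard 0/1 counts with the model's confidence mass. The sketch below is a minimal Python illustration of that general idea; the function name `confidence_prf1`, the per-example confidence dictionaries, and the exact numerators and denominators are assumptions for illustration, not the paper's definitions verbatim.

```python
from typing import Dict, List, Tuple

def confidence_prf1(y_true: List[str],
                    probs: List[Dict[str, float]],
                    label: str) -> Tuple[float, float, float]:
    """Confidence-weighted precision, recall, and F1 for one class `label`.

    y_true : gold label for each example
    probs  : per-example dict mapping each class to the model's confidence in it
    """
    # Confidence mass placed on `label` for examples that truly belong to it
    # (a soft analogue of the true-positive count).
    tp_mass = sum(p.get(label, 0.0)
                  for gold, p in zip(y_true, probs) if gold == label)
    # Total confidence mass assigned to `label` over all examples
    # (denominator of the precision-like quantity).
    pred_mass = sum(p.get(label, 0.0) for p in probs)
    # Number of gold examples of `label`
    # (denominator of the recall-like quantity).
    support = sum(1 for gold in y_true if gold == label)

    c_precision = tp_mass / pred_mass if pred_mass else 0.0
    c_recall = tp_mass / support if support else 0.0
    c_f1 = (2 * c_precision * c_recall / (c_precision + c_recall)
            if (c_precision + c_recall) else 0.0)
    return c_precision, c_recall, c_f1

if __name__ == "__main__":
    gold = ["pos", "neg", "pos"]
    scores = [{"pos": 0.9, "neg": 0.1},
              {"pos": 0.4, "neg": 0.6},
              {"pos": 0.7, "neg": 0.3}]
    # Expected output under these toy scores: (0.8, 0.8, 0.8)
    print(confidence_prf1(gold, scores, "pos"))
```

Unlike threshold-based precision and recall, these quantities move whenever the confidence scores move, which is the sensitivity property the abstract highlights; consult the paper itself for the exact formulations and the remaining properties.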