% Conference paper from the 57th Annual Meeting of the ACL (2019), by Wang and Li.
% Whole-word braces in the title keep "Bayes" and "F1" capitalised under
% bibliography styles that convert titles to sentence case.
@inproceedings{wang-li-2019-bayes,
    title = "{Bayes} Test of Precision, Recall, and {F1} Measure for Comparison of Two Natural Language Processing Models",
    author = "Wang, Ruibo and
      Li, Jihong",
    editor = "Korhonen, Anna and
      Traum, David and
      M{\`a}rquez, Llu{\'\i}s",
    booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
    month = jul,
    year = "2019",
    address = "Florence, Italy",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/P19-1405",
    doi = "10.18653/v1/P19-1405",
    pages = "4135--4145",
    abstract = "Direct comparison on point estimation of the precision (P), recall (R), and F1 measure of two natural language processing (NLP) models on a common test corpus is unreasonable and results in less replicable conclusions due to a lack of a statistical test. However, the existing t-tests in cross-validation (CV) for model comparison are inappropriate because the distributions of P, R, F1 are skewed and an interval estimation of P, R, and F1 based on a t-test may exceed [0,1]. In this study, we propose to use a block-regularized 3{\mbox{$\times$}}2 CV (3{\mbox{$\times$}}2 BCV) in model comparison because it could regularize the difference in certain frequency distributions over linguistic units between training and validation sets and yield stable estimators of P, R, and F1. On the basis of the 3{\mbox{$\times$}}2 BCV, we calibrate the posterior distributions of P, R, and F1 and derive an accurate interval estimation of P, R, and F1. Furthermore, we formulate the comparison into a hypothesis testing problem and propose a novel Bayes test. The test could directly compute the probabilities of the hypotheses on the basis of the posterior distributions and provide more informative decisions than the existing significance t-tests. Three experiments with regard to NLP chunking tasks are conducted, and the results illustrate the validity of the Bayes test.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-li-2019-bayes">
<titleInfo>
<title>Bayes Test of Precision, Recall, and F1 Measure for Comparison of Two Natural Language Processing Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruibo</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jihong</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Korhonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Traum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Màrquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Direct comparison on point estimation of the precision (P), recall (R), and F1 measure of two natural language processing (NLP) models on a common test corpus is unreasonable and results in less replicable conclusions due to a lack of a statistical test. However, the existing t-tests in cross-validation (CV) for model comparison are inappropriate because the distributions of P, R, F1 are skewed and an interval estimation of P, R, and F1 based on a t-test may exceed [0,1]. In this study, we propose to use a block-regularized 3×2 CV (3×2 BCV) in model comparison because it could regularize the difference in certain frequency distributions over linguistic units between training and validation sets and yield stable estimators of P, R, and F1. On the basis of the 3×2 BCV, we calibrate the posterior distributions of P, R, and F1 and derive an accurate interval estimation of P, R, and F1. Furthermore, we formulate the comparison into a hypothesis testing problem and propose a novel Bayes test. The test could directly compute the probabilities of the hypotheses on the basis of the posterior distributions and provide more informative decisions than the existing significance t-tests. Three experiments with regard to NLP chunking tasks are conducted, and the results illustrate the validity of the Bayes test.</abstract>
<identifier type="citekey">wang-li-2019-bayes</identifier>
<identifier type="doi">10.18653/v1/P19-1405</identifier>
<location>
<url>https://aclanthology.org/P19-1405</url>
</location>
<part>
<date>2019-07</date>
<extent unit="page">
<start>4135</start>
<end>4145</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bayes Test of Precision, Recall, and F1 Measure for Comparison of Two Natural Language Processing Models
%A Wang, Ruibo
%A Li, Jihong
%Y Korhonen, Anna
%Y Traum, David
%Y Màrquez, Lluís
%S Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
%D 2019
%8 July
%I Association for Computational Linguistics
%C Florence, Italy
%F wang-li-2019-bayes
%X Direct comparison on point estimation of the precision (P), recall (R), and F1 measure of two natural language processing (NLP) models on a common test corpus is unreasonable and results in less replicable conclusions due to a lack of a statistical test. However, the existing t-tests in cross-validation (CV) for model comparison are inappropriate because the distributions of P, R, F1 are skewed and an interval estimation of P, R, and F1 based on a t-test may exceed [0,1]. In this study, we propose to use a block-regularized 3×2 CV (3×2 BCV) in model comparison because it could regularize the difference in certain frequency distributions over linguistic units between training and validation sets and yield stable estimators of P, R, and F1. On the basis of the 3×2 BCV, we calibrate the posterior distributions of P, R, and F1 and derive an accurate interval estimation of P, R, and F1. Furthermore, we formulate the comparison into a hypothesis testing problem and propose a novel Bayes test. The test could directly compute the probabilities of the hypotheses on the basis of the posterior distributions and provide more informative decisions than the existing significance t-tests. Three experiments with regard to NLP chunking tasks are conducted, and the results illustrate the validity of the Bayes test.
%R 10.18653/v1/P19-1405
%U https://aclanthology.org/P19-1405
%U https://doi.org/10.18653/v1/P19-1405
%P 4135-4145
Markdown (Informal)
[Bayes Test of Precision, Recall, and F1 Measure for Comparison of Two Natural Language Processing Models](https://aclanthology.org/P19-1405) (Wang & Li, ACL 2019)
ACL