BibTeX
@inproceedings{arendt-etal-2021-crosscheck,
title = "{C}ross{C}heck: Rapid, Reproducible, and Interpretable Model Evaluation",
author = "Arendt, Dustin and
Shaw, Zhuanyi and
Shrestha, Prasha and
Ayton, Ellyn and
Glenski, Maria and
Volkova, Svitlana",
editor = "Dragut, Eduard and
Li, Yunyao and
Popa, Lucian and
Vucetic, Slobodan",
booktitle = "Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.dash-1.13",
doi = "10.18653/v1/2021.dash-1.13",
pages = "79--85",
abstract = "Evaluation beyond aggregate performance metrics, e.g. F1-score, is crucial to both establish an appropriate level of trust in machine learning models and identify avenues for future model improvements. In this paper we demonstrate CrossCheck, an interactive capability for rapid cross-model comparison and reproducible error analysis. We describe the tool, discuss design and implementation details, and present three NLP use cases {--} named entity recognition, reading comprehension, and clickbait detection {--} that show the benefits of using the tool for model evaluation. CrossCheck enables users to make informed decisions when choosing between multiple models, identify when the models are correct and for which examples, investigate whether the models are making the same mistakes as humans, evaluate models{'} generalizability and highlight models{'} limitations, strengths and weaknesses. Furthermore, CrossCheck is implemented as a Jupyter widget, which allows for rapid and convenient integration into existing model development workflows.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="arendt-etal-2021-crosscheck">
<titleInfo>
<title>CrossCheck: Rapid, Reproducible, and Interpretable Model Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dustin</namePart>
<namePart type="family">Arendt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhuanyi</namePart>
<namePart type="family">Shaw</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prasha</namePart>
<namePart type="family">Shrestha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ellyn</namePart>
<namePart type="family">Ayton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Glenski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Svitlana</namePart>
<namePart type="family">Volkova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eduard</namePart>
<namePart type="family">Dragut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucian</namePart>
<namePart type="family">Popa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Slobodan</namePart>
<namePart type="family">Vucetic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Evaluation beyond aggregate performance metrics, e.g. F1-score, is crucial to both establish an appropriate level of trust in machine learning models and identify avenues for future model improvements. In this paper we demonstrate CrossCheck, an interactive capability for rapid cross-model comparison and reproducible error analysis. We describe the tool, discuss design and implementation details, and present three NLP use cases – named entity recognition, reading comprehension, and clickbait detection – that show the benefits of using the tool for model evaluation. CrossCheck enables users to make informed decisions when choosing between multiple models, identify when the models are correct and for which examples, investigate whether the models are making the same mistakes as humans, evaluate models’ generalizability and highlight models’ limitations, strengths and weaknesses. Furthermore, CrossCheck is implemented as a Jupyter widget, which allows for rapid and convenient integration into existing model development workflows.</abstract>
<identifier type="citekey">arendt-etal-2021-crosscheck</identifier>
<identifier type="doi">10.18653/v1/2021.dash-1.13</identifier>
<location>
<url>https://aclanthology.org/2021.dash-1.13</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>79</start>
<end>85</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T CrossCheck: Rapid, Reproducible, and Interpretable Model Evaluation
%A Arendt, Dustin
%A Shaw, Zhuanyi
%A Shrestha, Prasha
%A Ayton, Ellyn
%A Glenski, Maria
%A Volkova, Svitlana
%Y Dragut, Eduard
%Y Li, Yunyao
%Y Popa, Lucian
%Y Vucetic, Slobodan
%S Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F arendt-etal-2021-crosscheck
%X Evaluation beyond aggregate performance metrics, e.g. F1-score, is crucial to both establish an appropriate level of trust in machine learning models and identify avenues for future model improvements. In this paper we demonstrate CrossCheck, an interactive capability for rapid cross-model comparison and reproducible error analysis. We describe the tool, discuss design and implementation details, and present three NLP use cases – named entity recognition, reading comprehension, and clickbait detection – that show the benefits of using the tool for model evaluation. CrossCheck enables users to make informed decisions when choosing between multiple models, identify when the models are correct and for which examples, investigate whether the models are making the same mistakes as humans, evaluate models’ generalizability and highlight models’ limitations, strengths and weaknesses. Furthermore, CrossCheck is implemented as a Jupyter widget, which allows for rapid and convenient integration into existing model development workflows.
%R 10.18653/v1/2021.dash-1.13
%U https://aclanthology.org/2021.dash-1.13
%U https://doi.org/10.18653/v1/2021.dash-1.13
%P 79-85
Markdown (Informal)
[CrossCheck: Rapid, Reproducible, and Interpretable Model Evaluation](https://aclanthology.org/2021.dash-1.13) (Arendt et al., DaSH 2021)
ACL
Dustin Arendt, Zhuanyi Shaw, Prasha Shrestha, Ellyn Ayton, Maria Glenski, and Svitlana Volkova. 2021. CrossCheck: Rapid, Reproducible, and Interpretable Model Evaluation. In Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances, pages 79–85, Online. Association for Computational Linguistics.