@article{hua-etal-2023-discover,
title = "Discover, Explain, Improve: An Automatic Slice Detection Benchmark for Natural Language Processing",
author = "Hua, Wenyue and
Jin, Lifeng and
Song, Linfeng and
Mi, Haitao and
Zhang, Yongfeng and
Yu, Dong",
journal = "Transactions of the Association for Computational Linguistics",
volume = "11",
year = "2023",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2023.tacl-1.87",
doi = "10.1162/tacl_a_00617",
pages = "1537--1552",
abstract = "Pretrained natural language processing (NLP) models have achieved high overall performance, but they still make systematic errors. Instead of manual error analysis, research on slice detection models (SDMs), which automatically identify underperforming groups of datapoints, has caught escalated attention in Computer Vision for both understanding model behaviors and providing insights for future model training and designing. However, little research on SDMs and quantitative evaluation of their effectiveness have been conducted on NLP tasks. Our paper fills the gap by proposing a benchmark named {``}Discover, Explain, Improve (DEIm){''} for classification NLP tasks along with a new SDM Edisa. Edisa discovers coherent and underperforming groups of datapoints; DEIm then unites them under human-understandable concepts and provides comprehensive evaluation tasks and corresponding quantitative metrics. The evaluation in DEIm shows that Edisa can accurately select error-prone datapoints with informative semantic features that summarize error patterns. Detecting difficult datapoints directly boosts model performance without tuning any original model parameters, showing that discovered slices are actionable for users.1",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hua-etal-2023-discover">
<titleInfo>
<title>Discover, Explain, Improve: An Automatic Slice Detection Benchmark for Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wenyue</namePart>
<namePart type="family">Hua</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lifeng</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Linfeng</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haitao</namePart>
<namePart type="family">Mi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yongfeng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dong</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Pretrained natural language processing (NLP) models have achieved high overall performance, but they still make systematic errors. Instead of manual error analysis, research on slice detection models (SDMs), which automatically identify underperforming groups of datapoints, has caught escalated attention in Computer Vision for both understanding model behaviors and providing insights for future model training and designing. However, little research on SDMs and quantitative evaluation of their effectiveness have been conducted on NLP tasks. Our paper fills the gap by proposing a benchmark named “Discover, Explain, Improve (DEIm)” for classification NLP tasks along with a new SDM Edisa. Edisa discovers coherent and underperforming groups of datapoints; DEIm then unites them under human-understandable concepts and provides comprehensive evaluation tasks and corresponding quantitative metrics. The evaluation in DEIm shows that Edisa can accurately select error-prone datapoints with informative semantic features that summarize error patterns. Detecting difficult datapoints directly boosts model performance without tuning any original model parameters, showing that discovered slices are actionable for users.1</abstract>
<identifier type="citekey">hua-etal-2023-discover</identifier>
<identifier type="doi">10.1162/tacl_a_00617</identifier>
<location>
<url>https://aclanthology.org/2023.tacl-1.87</url>
</location>
<part>
<date>2023</date>
<detail type="volume"><number>11</number></detail>
<extent unit="page">
<start>1537</start>
<end>1552</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Discover, Explain, Improve: An Automatic Slice Detection Benchmark for Natural Language Processing
%A Hua, Wenyue
%A Jin, Lifeng
%A Song, Linfeng
%A Mi, Haitao
%A Zhang, Yongfeng
%A Yu, Dong
%J Transactions of the Association for Computational Linguistics
%D 2023
%V 11
%I MIT Press
%C Cambridge, MA
%F hua-etal-2023-discover
%X Pretrained natural language processing (NLP) models have achieved high overall performance, but they still make systematic errors. Instead of manual error analysis, research on slice detection models (SDMs), which automatically identify underperforming groups of datapoints, has caught escalated attention in Computer Vision for both understanding model behaviors and providing insights for future model training and designing. However, little research on SDMs and quantitative evaluation of their effectiveness have been conducted on NLP tasks. Our paper fills the gap by proposing a benchmark named “Discover, Explain, Improve (DEIm)” for classification NLP tasks along with a new SDM Edisa. Edisa discovers coherent and underperforming groups of datapoints; DEIm then unites them under human-understandable concepts and provides comprehensive evaluation tasks and corresponding quantitative metrics. The evaluation in DEIm shows that Edisa can accurately select error-prone datapoints with informative semantic features that summarize error patterns. Detecting difficult datapoints directly boosts model performance without tuning any original model parameters, showing that discovered slices are actionable for users.1
%R 10.1162/tacl_a_00617
%U https://aclanthology.org/2023.tacl-1.87
%U https://doi.org/10.1162/tacl_a_00617
%P 1537-1552
Markdown (Informal)
[Discover, Explain, Improve: An Automatic Slice Detection Benchmark for Natural Language Processing](https://aclanthology.org/2023.tacl-1.87) (Hua et al., TACL 2023)
ACL