@inproceedings{rajani-etal-2022-seal,
title = "{SEAL}: Interactive Tool for Systematic Error Analysis and Labeling",
author = "Rajani, Nazneen and
Liang, Weixin and
Chen, Lingjiao and
Mitchell, Margaret and
Zou, James",
editor = "Che, Wanxiang and
Shutova, Ekaterina",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = dec,
year = "2022",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-demos.36",
doi = "10.18653/v1/2022.emnlp-demos.36",
pages = "359--370",
abstract = "With the advent of Transformers, large language models (LLMs) have saturated well-known NLP benchmarks and leaderboards with high aggregate performance. However, many times these models systematically fail on tail data or rare groups not obvious in aggregate evaluation. Identifying such problematic data groups is even more challenging when there are no explicit labels (e.g., ethnicity, gender, etc.) and further compounded for NLP datasets due to the lack of visual features to characterize failure modes (e.g., Asian males, animals indoors, waterbirds on land etc.). This paper introduces an interactive Systematic Error Analysis and Labeling (SEAL) tool that uses a two-step approach to first identify high-error slices of data and then, in the second step, introduce methods to give human-understandable semantics to those underperforming slices. We explore a variety of methods for coming up with coherent semantics for the error groups using language models for semantic labeling and a text-to-image model for generating visual features.SEAL is available at \url{https://huggingface.co/spaces/nazneen/seal}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rajani-etal-2022-seal">
<titleInfo>
<title>SEAL: Interactive Tool for Systematic Error Analysis and Labeling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nazneen</namePart>
<namePart type="family">Rajani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weixin</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lingjiao</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Margaret</namePart>
<namePart type="family">Mitchell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Zou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With the advent of Transformers, large language models (LLMs) have saturated well-known NLP benchmarks and leaderboards with high aggregate performance. However, many times these models systematically fail on tail data or rare groups not obvious in aggregate evaluation. Identifying such problematic data groups is even more challenging when there are no explicit labels (e.g., ethnicity, gender, etc.) and further compounded for NLP datasets due to the lack of visual features to characterize failure modes (e.g., Asian males, animals indoors, waterbirds on land etc.). This paper introduces an interactive Systematic Error Analysis and Labeling (SEAL) tool that uses a two-step approach to first identify high-error slices of data and then, in the second step, introduce methods to give human-understandable semantics to those underperforming slices. We explore a variety of methods for coming up with coherent semantics for the error groups using language models for semantic labeling and a text-to-image model for generating visual features.SEAL is available at https://huggingface.co/spaces/nazneen/seal.</abstract>
<identifier type="citekey">rajani-etal-2022-seal</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-demos.36</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-demos.36</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>359</start>
<end>370</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SEAL: Interactive Tool for Systematic Error Analysis and Labeling
%A Rajani, Nazneen
%A Liang, Weixin
%A Chen, Lingjiao
%A Mitchell, Margaret
%A Zou, James
%Y Che, Wanxiang
%Y Shutova, Ekaterina
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F rajani-etal-2022-seal
%X With the advent of Transformers, large language models (LLMs) have saturated well-known NLP benchmarks and leaderboards with high aggregate performance. However, many times these models systematically fail on tail data or rare groups not obvious in aggregate evaluation. Identifying such problematic data groups is even more challenging when there are no explicit labels (e.g., ethnicity, gender, etc.) and further compounded for NLP datasets due to the lack of visual features to characterize failure modes (e.g., Asian males, animals indoors, waterbirds on land etc.). This paper introduces an interactive Systematic Error Analysis and Labeling (SEAL) tool that uses a two-step approach to first identify high-error slices of data and then, in the second step, introduce methods to give human-understandable semantics to those underperforming slices. We explore a variety of methods for coming up with coherent semantics for the error groups using language models for semantic labeling and a text-to-image model for generating visual features.SEAL is available at https://huggingface.co/spaces/nazneen/seal.
%R 10.18653/v1/2022.emnlp-demos.36
%U https://aclanthology.org/2022.emnlp-demos.36
%U https://doi.org/10.18653/v1/2022.emnlp-demos.36
%P 359-370
Markdown (Informal)
[SEAL: Interactive Tool for Systematic Error Analysis and Labeling](https://aclanthology.org/2022.emnlp-demos.36) (Rajani et al., EMNLP 2022)
ACL
- Nazneen Rajani, Weixin Liang, Lingjiao Chen, Margaret Mitchell, and James Zou. 2022. SEAL: Interactive Tool for Systematic Error Analysis and Labeling. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pages 359–370, Abu Dhabi, UAE. Association for Computational Linguistics.