@inproceedings{huang-zhu-2023-statistically,
title = "Statistically Profiling Biases in Natural Language Reasoning Datasets and Models",
author = "Huang, Shanshan and
Zhu, Kenny",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.299",
doi = "10.18653/v1/2023.findings-emnlp.299",
pages = "4521--4530",
abstract = "Recent studies have shown that many natural language understanding and reasoning datasets contain statistical cues that can be exploited by NLP models, resulting in an overestimation of their capabilities. Existing methods, such as {``}hypothesis-only{''} tests and CheckList, are limited in identifying these cues and evaluating model weaknesses. We introduce ICQ (I-See-Cue), a lightweight, general statistical profiling framework that automatically identifies potential biases in multiple-choice NLU datasets without requiring additional test cases. ICQ assesses the extent to which models exploit these biases through black-box testing, addressing the limitations of current methods. In this work, we conduct a comprehensive evaluation of statistical biases in 10 popular NLU datasets and 4 models, confirming prior findings, revealing new insights, and offering an online demonstration system to encourage users to assess their own datasets and models. Furthermore, we present a case study on investigating ChatGPT{'}s bias, providing valuable recommendations for practical applications.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="huang-zhu-2023-statistically">
    <titleInfo>
      <title>Statistically Profiling Biases in Natural Language Reasoning Datasets and Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Shanshan</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kenny</namePart>
      <namePart type="family">Zhu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Houda</namePart>
        <namePart type="family">Bouamor</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juan</namePart>
        <namePart type="family">Pino</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kalika</namePart>
        <namePart type="family">Bali</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Singapore</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Recent studies have shown that many natural language understanding and reasoning datasets contain statistical cues that can be exploited by NLP models, resulting in an overestimation of their capabilities. Existing methods, such as “hypothesis-only” tests and CheckList, are limited in identifying these cues and evaluating model weaknesses. We introduce ICQ (I-See-Cue), a lightweight, general statistical profiling framework that automatically identifies potential biases in multiple-choice NLU datasets without requiring additional test cases. ICQ assesses the extent to which models exploit these biases through black-box testing, addressing the limitations of current methods. In this work, we conduct a comprehensive evaluation of statistical biases in 10 popular NLU datasets and 4 models, confirming prior findings, revealing new insights, and offering an online demonstration system to encourage users to assess their own datasets and models. Furthermore, we present a case study on investigating ChatGPT’s bias, providing valuable recommendations for practical applications.</abstract>
    <identifier type="citekey">huang-zhu-2023-statistically</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-emnlp.299</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-emnlp.299</url>
    </location>
    <part>
      <date>2023-12</date>
      <extent unit="page">
        <start>4521</start>
        <end>4530</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Statistically Profiling Biases in Natural Language Reasoning Datasets and Models
%A Huang, Shanshan
%A Zhu, Kenny
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F huang-zhu-2023-statistically
%X Recent studies have shown that many natural language understanding and reasoning datasets contain statistical cues that can be exploited by NLP models, resulting in an overestimation of their capabilities. Existing methods, such as “hypothesis-only” tests and CheckList, are limited in identifying these cues and evaluating model weaknesses. We introduce ICQ (I-See-Cue), a lightweight, general statistical profiling framework that automatically identifies potential biases in multiple-choice NLU datasets without requiring additional test cases. ICQ assesses the extent to which models exploit these biases through black-box testing, addressing the limitations of current methods. In this work, we conduct a comprehensive evaluation of statistical biases in 10 popular NLU datasets and 4 models, confirming prior findings, revealing new insights, and offering an online demonstration system to encourage users to assess their own datasets and models. Furthermore, we present a case study on investigating ChatGPT’s bias, providing valuable recommendations for practical applications.
%R 10.18653/v1/2023.findings-emnlp.299
%U https://aclanthology.org/2023.findings-emnlp.299
%U https://doi.org/10.18653/v1/2023.findings-emnlp.299
%P 4521-4530
Markdown (Informal)
[Statistically Profiling Biases in Natural Language Reasoning Datasets and Models](https://aclanthology.org/2023.findings-emnlp.299) (Huang & Zhu, Findings 2023)
ACL
Shanshan Huang and Kenny Zhu. 2023. Statistically Profiling Biases in Natural Language Reasoning Datasets and Models. In Findings of the Association for Computational Linguistics: EMNLP 2023, pages 4521–4530, Singapore. Association for Computational Linguistics.