@inproceedings{sekhon-etal-2022-white,
title = "White-box Testing of {NLP} models with Mask Neuron Coverage",
author = "Sekhon, Arshdeep and
Ji, Yangfeng and
Dwyer, Matthew and
Qi, Yanjun",
editor = "Carpuat, Marine and
de Marneffe, Marie-Catherine and
Meza Ruiz, Ivan Vladimir",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2022",
month = jul,
year = "2022",
address = "Seattle, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-naacl.116",
doi = "10.18653/v1/2022.findings-naacl.116",
pages = "1547--1558",
abstract = "Recent literature has seen growing interest in using black-box strategies like CheckList for testing the behavior of NLP models. Research on white-box testing has developed a number of methods for evaluating how thoroughly the internal behavior of deep models is tested, but they are not applicable to NLP models. We propose a set of white-box testing methods that are customized for transformer-based NLP models. These include MASK NEURON COVERAGE (MNCOVER) that measures how thoroughly the attention layers in models are exercised during testing. We show that MNCOVER can refine testing suites generated by CheckList by substantially reducing them in size, for more than 60{\%} on average, while retaining failing tests {--} thereby concentrating the fault detection power of the test suite. Further we show how MNCOVER can be used to guide CheckList input generation, evaluate alternative NLP testing methods, and drive data augmentation to improve accuracy.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sekhon-etal-2022-white">
<titleInfo>
<title>White-box Testing of NLP models with Mask Neuron Coverage</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arshdeep</namePart>
<namePart type="family">Sekhon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yangfeng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Dwyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanjun</namePart>
<namePart type="family">Qi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marine</namePart>
<namePart type="family">Carpuat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Catherine</namePart>
<namePart type="family">de Marneffe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="given">Vladimir</namePart>
<namePart type="family">Meza Ruiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent literature has seen growing interest in using black-box strategies like CheckList for testing the behavior of NLP models. Research on white-box testing has developed a number of methods for evaluating how thoroughly the internal behavior of deep models is tested, but they are not applicable to NLP models. We propose a set of white-box testing methods that are customized for transformer-based NLP models. These include MASK NEURON COVERAGE (MNCOVER) that measures how thoroughly the attention layers in models are exercised during testing. We show that MNCOVER can refine testing suites generated by CheckList by substantially reducing them in size, for more than 60% on average, while retaining failing tests – thereby concentrating the fault detection power of the test suite. Further we show how MNCOVER can be used to guide CheckList input generation, evaluate alternative NLP testing methods, and drive data augmentation to improve accuracy.</abstract>
<identifier type="citekey">sekhon-etal-2022-white</identifier>
<identifier type="doi">10.18653/v1/2022.findings-naacl.116</identifier>
<location>
<url>https://aclanthology.org/2022.findings-naacl.116</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>1547</start>
<end>1558</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T White-box Testing of NLP models with Mask Neuron Coverage
%A Sekhon, Arshdeep
%A Ji, Yangfeng
%A Dwyer, Matthew
%A Qi, Yanjun
%Y Carpuat, Marine
%Y de Marneffe, Marie-Catherine
%Y Meza Ruiz, Ivan Vladimir
%S Findings of the Association for Computational Linguistics: NAACL 2022
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, United States
%F sekhon-etal-2022-white
%X Recent literature has seen growing interest in using black-box strategies like CheckList for testing the behavior of NLP models. Research on white-box testing has developed a number of methods for evaluating how thoroughly the internal behavior of deep models is tested, but they are not applicable to NLP models. We propose a set of white-box testing methods that are customized for transformer-based NLP models. These include MASK NEURON COVERAGE (MNCOVER) that measures how thoroughly the attention layers in models are exercised during testing. We show that MNCOVER can refine testing suites generated by CheckList by substantially reducing them in size, for more than 60% on average, while retaining failing tests – thereby concentrating the fault detection power of the test suite. Further we show how MNCOVER can be used to guide CheckList input generation, evaluate alternative NLP testing methods, and drive data augmentation to improve accuracy.
%R 10.18653/v1/2022.findings-naacl.116
%U https://aclanthology.org/2022.findings-naacl.116
%U https://doi.org/10.18653/v1/2022.findings-naacl.116
%P 1547-1558
Markdown (Informal)
[White-box Testing of NLP models with Mask Neuron Coverage](https://aclanthology.org/2022.findings-naacl.116) (Sekhon et al., Findings 2022)
ACL