@inproceedings{pham-etal-2024-peeb,
title = "{PEEB}: Part-based Image Classifiers with an Explainable and Editable Language Bottleneck",
author = "Pham, Thang and
Chen, Peijie and
Nguyen, Tin and
Yoon, Seunghyun and
Bui, Trung and
Nguyen, Anh",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.131",
doi = "10.18653/v1/2024.findings-naacl.131",
pages = "2018--2053",
abstract = "CLIP-based classifiers rely on the prompt containing a class name that is known to the text encoder. Therefore, they perform poorly on new classes or the classes whose names rarely appear on the Internet (e.g., scientific names of birds). For fine-grained classification, we propose PEEB {--} an explainable and editable classifier to (1) express the class name into a set of text descriptors that describe the visual parts of that class; and (2) match the embeddings of the detected parts to their textual descriptors in each class to compute a logit score for classification. In a zero-shot setting where the class names are unknown, PEEB outperforms CLIP by a huge margin (∼10{\mbox{$\times$}} in top-1 accuracy). Compared to part-based classifiers, PEEB is not only the state-of-the-art (SOTA) on the supervised-learning setting (88.80{\%} and 92.20{\%} accuracy on CUB-200 and Stanford Dogs-120, respectively) but also the first to enable users to edit the text descriptors to form a new classifier without any re-training. Compared to concept bottleneck models, PEEB is also the SOTA in both zero-shot and supervised-learning settings.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pham-etal-2024-peeb">
<titleInfo>
<title>PEEB: Part-based Image Classifiers with an Explainable and Editable Language Bottleneck</title>
</titleInfo>
<name type="personal">
<namePart type="given">Thang</namePart>
<namePart type="family">Pham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peijie</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tin</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seunghyun</namePart>
<namePart type="family">Yoon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trung</namePart>
<namePart type="family">Bui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anh</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>CLIP-based classifiers rely on the prompt containing a class name that is known to the text encoder. Therefore, they perform poorly on new classes or the classes whose names rarely appear on the Internet (e.g., scientific names of birds). For fine-grained classification, we propose PEEB – an explainable and editable classifier to (1) express the class name into a set of text descriptors that describe the visual parts of that class; and (2) match the embeddings of the detected parts to their textual descriptors in each class to compute a logit score for classification. In a zero-shot setting where the class names are unknown, PEEB outperforms CLIP by a huge margin (∼10\times in top-1 accuracy). Compared to part-based classifiers, PEEB is not only the state-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20% accuracy on CUB-200 and Stanford Dogs-120, respectively) but also the first to enable users to edit the text descriptors to form a new classifier without any re-training. Compared to concept bottleneck models, PEEB is also the SOTA in both zero-shot and supervised-learning settings.</abstract>
<identifier type="citekey">pham-etal-2024-peeb</identifier>
<identifier type="doi">10.18653/v1/2024.findings-naacl.131</identifier>
<location>
<url>https://aclanthology.org/2024.findings-naacl.131</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>2018</start>
<end>2053</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PEEB: Part-based Image Classifiers with an Explainable and Editable Language Bottleneck
%A Pham, Thang
%A Chen, Peijie
%A Nguyen, Tin
%A Yoon, Seunghyun
%A Bui, Trung
%A Nguyen, Anh
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F pham-etal-2024-peeb
%X CLIP-based classifiers rely on the prompt containing a class name that is known to the text encoder. Therefore, they perform poorly on new classes or the classes whose names rarely appear on the Internet (e.g., scientific names of birds). For fine-grained classification, we propose PEEB – an explainable and editable classifier to (1) express the class name into a set of text descriptors that describe the visual parts of that class; and (2) match the embeddings of the detected parts to their textual descriptors in each class to compute a logit score for classification. In a zero-shot setting where the class names are unknown, PEEB outperforms CLIP by a huge margin (∼10\times in top-1 accuracy). Compared to part-based classifiers, PEEB is not only the state-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20% accuracy on CUB-200 and Stanford Dogs-120, respectively) but also the first to enable users to edit the text descriptors to form a new classifier without any re-training. Compared to concept bottleneck models, PEEB is also the SOTA in both zero-shot and supervised-learning settings.
%R 10.18653/v1/2024.findings-naacl.131
%U https://aclanthology.org/2024.findings-naacl.131
%U https://doi.org/10.18653/v1/2024.findings-naacl.131
%P 2018-2053
Markdown (Informal)
[PEEB: Part-based Image Classifiers with an Explainable and Editable Language Bottleneck](https://aclanthology.org/2024.findings-naacl.131) (Pham et al., Findings 2024)
ACL