@inproceedings{andrade-etal-2024-explaining,
    title = "Explaining the Hardest Errors of Contextual Embedding Based Classifiers",
    author = "Andrade, Claudio Mois{\'e}s Valiense De and
      Cunha, Washington and
      Fonseca, Guilherme and
      Pagano, Ana Clara Souza and
      Santos, Luana De Castro and
      Pagano, Adriana Silvina and
      Rocha, Leonardo Chaves Dutra Da and
      Gon{\c{c}}alves, Marcos Andr{\'e}",
    editor = "Barak, Libby and
      Alikhani, Malihe",
    booktitle = "Proceedings of the 28th Conference on Computational Natural Language Learning",
    month = nov,
    year = "2024",
    address = "Miami, FL, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.conll-1.31",
    pages = "419--434",
    abstract = "We seek to explain the causes of the misclassification of the most challenging documents, namely those that no classifier using state-of-the-art, very semantically-separable contextual embedding representations managed to predict accurately. To do so, we propose a taxonomy of incorrect predictions, which we used to perform qualitative human evaluation. We posed two (research) questions, considering three sentiment datasets in two different domains {--} movie and product reviews. Evaluators with two different backgrounds evaluated documents by comparing the predominant sentiment assigned by the model to the label in the gold dataset in order to decide on a likely misclassification reason. Based on a high inter-evaluator agreement (81.7{\%}), we observed significant differences between the product and movie review domains, such as the prevalence of ambivalence in product reviews and sarcasm in movie reviews. Our analysis also revealed an unexpectedly high rate of incorrect labeling in the gold dataset (up to 33{\%}) and a significant amount of incorrect prediction by the model due to a series of linguistic phenomena (including amplified words, contrastive markers, comparative sentences, and references to world knowledge). Overall, our taxonomy and methodology allow us to explain between 80{\%}-85{\%} of the errors with high confidence (agreement) {--} enabling us to point out where future efforts to improve models should be concentrated.",
}
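For programmatic reuse, the BibTeX record above can be loaded into plain Python dictionaries. A minimal sketch, assuming the record is saved locally as `andrade2024.bib` (an illustrative file name) and the v1 API of the third-party `bibtexparser` package is installed:

```python
# Minimal sketch: load the BibTeX record above with bibtexparser (v1 API).
# Assumes the entry has been saved to a local file named andrade2024.bib.
import bibtexparser

with open("andrade2024.bib", encoding="utf-8") as bibfile:
    database = bibtexparser.load(bibfile)  # returns a BibDatabase object

entry = database.entries[0]  # entries are plain dicts keyed by field name
print(entry["ID"])      # andrade-etal-2024-explaining
print(entry["title"])   # Explaining the Hardest Errors of ...
print(entry["pages"])   # 419--434
```

Note that LaTeX escapes such as `{\'e}` are kept verbatim by the parser; converting them to Unicode is a separate post-processing step.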
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="andrade-etal-2024-explaining">
    <titleInfo>
      <title>Explaining the Hardest Errors of Contextual Embedding Based Classifiers</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Claudio</namePart>
      <namePart type="given">Moisés</namePart>
      <namePart type="given">Valiense</namePart>
      <namePart type="given">De</namePart>
      <namePart type="family">Andrade</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Washington</namePart>
      <namePart type="family">Cunha</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Guilherme</namePart>
      <namePart type="family">Fonseca</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ana</namePart>
      <namePart type="given">Clara</namePart>
      <namePart type="given">Souza</namePart>
      <namePart type="family">Pagano</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Luana</namePart>
      <namePart type="given">De</namePart>
      <namePart type="given">Castro</namePart>
      <namePart type="family">Santos</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Adriana</namePart>
      <namePart type="given">Silvina</namePart>
      <namePart type="family">Pagano</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Leonardo</namePart>
      <namePart type="given">Chaves</namePart>
      <namePart type="given">Dutra</namePart>
      <namePart type="given">Da</namePart>
      <namePart type="family">Rocha</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marcos</namePart>
      <namePart type="given">André</namePart>
      <namePart type="family">Gonçalves</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 28th Conference on Computational Natural Language Learning</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Libby</namePart>
        <namePart type="family">Barak</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Malihe</namePart>
        <namePart type="family">Alikhani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, FL, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We seek to explain the causes of the misclassification of the most challenging documents, namely those that no classifier using state-of-the-art, very semantically-separable contextual embedding representations managed to predict accurately. To do so, we propose a taxonomy of incorrect predictions, which we used to perform qualitative human evaluation. We posed two (research) questions, considering three sentiment datasets in two different domains – movie and product reviews. Evaluators with two different backgrounds evaluated documents by comparing the predominant sentiment assigned by the model to the label in the gold dataset in order to decide on a likely misclassification reason. Based on a high inter-evaluator agreement (81.7%), we observed significant differences between the product and movie review domains, such as the prevalence of ambivalence in product reviews and sarcasm in movie reviews. Our analysis also revealed an unexpectedly high rate of incorrect labeling in the gold dataset (up to 33%) and a significant amount of incorrect prediction by the model due to a series of linguistic phenomena (including amplified words, contrastive markers, comparative sentences, and references to world knowledge). Overall, our taxonomy and methodology allow us to explain between 80%-85% of the errors with high confidence (agreement) – enabling us to point out where future efforts to improve models should be concentrated.</abstract>
    <identifier type="citekey">andrade-etal-2024-explaining</identifier>
    <location>
      <url>https://aclanthology.org/2024.conll-1.31</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>419</start>
        <end>434</end>
      </extent>
    </part>
  </mods>
</modsCollection>
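The MODS record above is namespaced XML and can be queried with Python's standard library alone. A minimal sketch, assuming the record is saved as `andrade2024.xml` (an illustrative file name):

```python
# Minimal sketch: extract title, authors, and page range from the MODS XML
# record above using only the standard library.
import xml.etree.ElementTree as ET

NS = {"m": "http://www.loc.gov/mods/v3"}  # default namespace of the record

root = ET.parse("andrade2024.xml").getroot()  # <modsCollection>
mods = root.find("m:mods", NS)

title = mods.find("m:titleInfo/m:title", NS).text

# Top-level <name> children are the authors; the editors sit inside
# <relatedItem> and are therefore not matched here.
authors = [
    " ".join(part.text for part in name.findall("m:namePart", NS))
    for name in mods.findall("m:name", NS)
]

extent = mods.find("m:part/m:extent", NS)
print(title)
print(authors[0])                       # Claudio Moisés Valiense De Andrade
print(extent.find("m:start", NS).text,  # 419
      extent.find("m:end", NS).text)    # 434
```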
%0 Conference Proceedings
%T Explaining the Hardest Errors of Contextual Embedding Based Classifiers
%A Andrade, Claudio Moisés Valiense De
%A Cunha, Washington
%A Fonseca, Guilherme
%A Pagano, Ana Clara Souza
%A Santos, Luana De Castro
%A Pagano, Adriana Silvina
%A Rocha, Leonardo Chaves Dutra Da
%A Gonçalves, Marcos André
%Y Barak, Libby
%Y Alikhani, Malihe
%S Proceedings of the 28th Conference on Computational Natural Language Learning
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, FL, USA
%F andrade-etal-2024-explaining
%X We seek to explain the causes of the misclassification of the most challenging documents, namely those that no classifier using state-of-the-art, very semantically-separable contextual embedding representations managed to predict accurately. To do so, we propose a taxonomy of incorrect predictions, which we used to perform qualitative human evaluation. We posed two (research) questions, considering three sentiment datasets in two different domains – movie and product reviews. Evaluators with two different backgrounds evaluated documents by comparing the predominant sentiment assigned by the model to the label in the gold dataset in order to decide on a likely misclassification reason. Based on a high inter-evaluator agreement (81.7%), we observed significant differences between the product and movie review domains, such as the prevalence of ambivalence in product reviews and sarcasm in movie reviews. Our analysis also revealed an unexpectedly high rate of incorrect labeling in the gold dataset (up to 33%) and a significant amount of incorrect prediction by the model due to a series of linguistic phenomena (including amplified words, contrastive markers, comparative sentences, and references to world knowledge). Overall, our taxonomy and methodology allow us to explain between 80%-85% of the errors with high confidence (agreement) – enabling us to point out where future efforts to improve models should be concentrated.
%U https://aclanthology.org/2024.conll-1.31
%P 419-434
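The `%`-tagged block above is the Endnote (refer) export: each line starts with a two-character tag (`%T` title, `%A` author, `%P` pages, and so on), and repeated tags accumulate in order. A minimal hand-rolled parser sketch, assuming the block is saved as `andrade2024.enw` (an illustrative file name):

```python
# Minimal sketch: parse the %-tagged Endnote (refer) record above into a
# dict mapping each tag to a list of values, preserving tag order.
from collections import defaultdict

record = defaultdict(list)
with open("andrade2024.enw", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line.startswith("%") and len(line) > 2:
            tag, _, value = line.partition(" ")  # split tag from its value
            record[tag].append(value)

print(record["%T"][0])  # the title
print(record["%A"])     # all eight authors, in citation order
print(record["%P"][0])  # 419-434
```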
Markdown (Informal)
[Explaining the Hardest Errors of Contextual Embedding Based Classifiers](https://aclanthology.org/2024.conll-1.31) (Andrade et al., CoNLL 2024)
ACL
Claudio Moisés Valiense De Andrade, Washington Cunha, Guilherme Fonseca, Ana Clara Souza Pagano, Luana De Castro Santos, Adriana Silvina Pagano, Leonardo Chaves Dutra Da Rocha, and Marcos André Gonçalves. 2024. Explaining the Hardest Errors of Contextual Embedding Based Classifiers. In Proceedings of the 28th Conference on Computational Natural Language Learning, pages 419–434, Miami, FL, USA. Association for Computational Linguistics.