@inproceedings{mohebbi-etal-2023-quantifying,
    title = "Quantifying Context Mixing in Transformers",
    author = "Mohebbi, Hosein and
      Zuidema, Willem and
      Chrupa{\l}a, Grzegorz and
      Alishahi, Afra",
    editor = "Vlachos, Andreas and
      Augenstein, Isabelle",
    booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
    month = may,
    year = "2023",
    address = "Dubrovnik, Croatia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.eacl-main.245",
    doi = "10.18653/v1/2023.eacl-main.245",
    pages = "3378--3400",
    abstract = "Self-attention weights and their transformed variants have been the main source of information for analyzing token-to-token interactions in Transformer-based models. But despite their ease of interpretation, these weights are not faithful to the models{'} decisions as they are only one part of an encoder, and other components in the encoder layer can have considerable impact on information mixing in the output representations. In this work, by expanding the scope of analysis to the whole encoder block, we propose Value Zeroing, a novel context mixing score customized for Transformers that provides us with a deeper understanding of how information is mixed at each encoder layer. We demonstrate the superiority of our context mixing score over other analysis methods through a series of complementary evaluations with different viewpoints based on linguistically informed rationales, probing, and faithfulness analysis.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="mohebbi-etal-2023-quantifying">
    <titleInfo>
      <title>Quantifying Context Mixing in Transformers</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Hosein</namePart>
      <namePart type="family">Mohebbi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Willem</namePart>
      <namePart type="family">Zuidema</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Grzegorz</namePart>
      <namePart type="family">Chrupała</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Afra</namePart>
      <namePart type="family">Alishahi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Vlachos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Isabelle</namePart>
        <namePart type="family">Augenstein</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Self-attention weights and their transformed variants have been the main source of information for analyzing token-to-token interactions in Transformer-based models. But despite their ease of interpretation, these weights are not faithful to the models’ decisions as they are only one part of an encoder, and other components in the encoder layer can have considerable impact on information mixing in the output representations. In this work, by expanding the scope of analysis to the whole encoder block, we propose Value Zeroing, a novel context mixing score customized for Transformers that provides us with a deeper understanding of how information is mixed at each encoder layer. We demonstrate the superiority of our context mixing score over other analysis methods through a series of complementary evaluations with different viewpoints based on linguistically informed rationales, probing, and faithfulness analysis.</abstract>
    <identifier type="citekey">mohebbi-etal-2023-quantifying</identifier>
    <identifier type="doi">10.18653/v1/2023.eacl-main.245</identifier>
    <location>
      <url>https://aclanthology.org/2023.eacl-main.245</url>
    </location>
    <part>
      <date>2023-05</date>
      <extent unit="page">
        <start>3378</start>
        <end>3400</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Quantifying Context Mixing in Transformers
%A Mohebbi, Hosein
%A Zuidema, Willem
%A Chrupała, Grzegorz
%A Alishahi, Afra
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F mohebbi-etal-2023-quantifying
%X Self-attention weights and their transformed variants have been the main source of information for analyzing token-to-token interactions in Transformer-based models. But despite their ease of interpretation, these weights are not faithful to the models’ decisions as they are only one part of an encoder, and other components in the encoder layer can have considerable impact on information mixing in the output representations. In this work, by expanding the scope of analysis to the whole encoder block, we propose Value Zeroing, a novel context mixing score customized for Transformers that provides us with a deeper understanding of how information is mixed at each encoder layer. We demonstrate the superiority of our context mixing score over other analysis methods through a series of complementary evaluations with different viewpoints based on linguistically informed rationales, probing, and faithfulness analysis.
%R 10.18653/v1/2023.eacl-main.245
%U https://aclanthology.org/2023.eacl-main.245
%U https://doi.org/10.18653/v1/2023.eacl-main.245
%P 3378-3400
Markdown (Informal)
[Quantifying Context Mixing in Transformers](https://aclanthology.org/2023.eacl-main.245) (Mohebbi et al., EACL 2023)
ACL
- Hosein Mohebbi, Willem Zuidema, Grzegorz Chrupała, and Afra Alishahi. 2023. Quantifying Context Mixing in Transformers. In Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics, pages 3378–3400, Dubrovnik, Croatia. Association for Computational Linguistics.
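As an informal aid to the abstract above, here is a minimal sketch of the Value Zeroing idea: zero out one token's value vector inside an encoder layer, recompute the layer's outputs, and measure how much every other token's representation changes. This is not the paper's implementation; it uses a toy single-head NumPy self-attention layer, and all names, dimensions, and the choice of cosine distance as the dissimilarity measure are illustrative assumptions. The actual method operates on full pretrained Transformer encoder blocks (see the paper linked above).

```python
# Toy illustration of "Value Zeroing" as a context mixing score.
# Assumptions: a single-head self-attention layer with a residual connection,
# random weights, and cosine distance as the change measure. Hypothetical code,
# not the authors' implementation.
import numpy as np

rng = np.random.default_rng(0)
n_tokens, d_model = 5, 8

# Toy token representations and projection matrices for one encoder layer.
X = rng.normal(size=(n_tokens, d_model))
W_q, W_k, W_v, W_o = (rng.normal(size=(d_model, d_model)) * 0.1 for _ in range(4))

def encoder_layer(X, zero_value_of=None):
    """Single-head self-attention + residual; optionally zero one token's value vector."""
    Q, K, V = X @ W_q, X @ W_k, X @ W_v
    if zero_value_of is not None:
        V = V.copy()
        V[zero_value_of] = 0.0          # the value-zeroing intervention
    scores = Q @ K.T / np.sqrt(d_model)
    attn = np.exp(scores - scores.max(axis=-1, keepdims=True))
    attn /= attn.sum(axis=-1, keepdims=True)
    return X + (attn @ V) @ W_o         # keep the residual path, as in an encoder block

def cosine_distance(a, b):
    return 1.0 - (a @ b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12)

# mixing[i, j] = how much token i's output changes when token j's value vector is zeroed.
baseline = encoder_layer(X)
mixing = np.zeros((n_tokens, n_tokens))
for j in range(n_tokens):
    ablated = encoder_layer(X, zero_value_of=j)
    for i in range(n_tokens):
        mixing[i, j] = cosine_distance(baseline[i], ablated[i])

# Row-normalize so each row reads as a per-token context mixing distribution.
mixing /= mixing.sum(axis=-1, keepdims=True) + 1e-12
print(np.round(mixing, 3))
```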