@inproceedings{treviso-etal-2022-predicting,
    title = "Predicting Attention Sparsity in Transformers",
    author = "Treviso, Marcos and
      G{\'o}is, Ant{\'o}nio and
      Fernandes, Patrick and
      Fonseca, Erick and
      Martins, Andre",
    editor = "Vlachos, Andreas and
      Agrawal, Priyanka and
      Martins, Andr{\'e} and
      Lampouras, Gerasimos and
      Lyu, Chunchuan",
    booktitle = "Proceedings of the Sixth Workshop on Structured Prediction for NLP",
    month = may,
    year = "2022",
    address = "Dublin, Ireland",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.spnlp-1.7",
    doi = "10.18653/v1/2022.spnlp-1.7",
    pages = "67--81",
    abstract = "Transformers{'} quadratic complexity with respect to the input sequence length has motivated a body of work on efficient sparse approximations to softmax. An alternative path, used by entmax transformers, consists of having built-in exact sparse attention; however this approach still requires quadratic computation. In this paper, we propose Sparsefinder, a simple model trained to identify the sparsity pattern of entmax attention before computing it. We experiment with three variants of our method, based on distances, quantization, and clustering, on two tasks: machine translation (attention in the decoder) and masked language modeling (encoder-only). Our work provides a new angle to study model efficiency by doing extensive analysis of the tradeoff between the sparsity and recall of the predicted attention graph. This allows for detailed comparison between different models along their Pareto curves, important to guide future benchmarks for sparse attention models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="treviso-etal-2022-predicting">
<titleInfo>
<title>Predicting Attention Sparsity in Transformers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Treviso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">António</namePart>
<namePart type="family">Góis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Fernandes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erick</namePart>
<namePart type="family">Fonseca</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth Workshop on Structured Prediction for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Vlachos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Priyanka</namePart>
<namePart type="family">Agrawal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">André</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gerasimos</namePart>
<namePart type="family">Lampouras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chunchuan</namePart>
<namePart type="family">Lyu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Transformers’ quadratic complexity with respect to the input sequence length has motivated a body of work on efficient sparse approximations to softmax. An alternative path, used by entmax transformers, consists of having built-in exact sparse attention; however this approach still requires quadratic computation. In this paper, we propose Sparsefinder, a simple model trained to identify the sparsity pattern of entmax attention before computing it. We experiment with three variants of our method, based on distances, quantization, and clustering, on two tasks: machine translation (attention in the decoder) and masked language modeling (encoder-only). Our work provides a new angle to study model efficiency by doing extensive analysis of the tradeoff between the sparsity and recall of the predicted attention graph. This allows for detailed comparison between different models along their Pareto curves, important to guide future benchmarks for sparse attention models.</abstract>
<identifier type="citekey">treviso-etal-2022-predicting</identifier>
<identifier type="doi">10.18653/v1/2022.spnlp-1.7</identifier>
<location>
<url>https://aclanthology.org/2022.spnlp-1.7</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>67</start>
<end>81</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Predicting Attention Sparsity in Transformers
%A Treviso, Marcos
%A Góis, António
%A Fernandes, Patrick
%A Fonseca, Erick
%A Martins, Andre
%Y Vlachos, Andreas
%Y Agrawal, Priyanka
%Y Martins, André
%Y Lampouras, Gerasimos
%Y Lyu, Chunchuan
%S Proceedings of the Sixth Workshop on Structured Prediction for NLP
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F treviso-etal-2022-predicting
%X Transformers’ quadratic complexity with respect to the input sequence length has motivated a body of work on efficient sparse approximations to softmax. An alternative path, used by entmax transformers, consists of having built-in exact sparse attention; however this approach still requires quadratic computation. In this paper, we propose Sparsefinder, a simple model trained to identify the sparsity pattern of entmax attention before computing it. We experiment with three variants of our method, based on distances, quantization, and clustering, on two tasks: machine translation (attention in the decoder) and masked language modeling (encoder-only). Our work provides a new angle to study model efficiency by doing extensive analysis of the tradeoff between the sparsity and recall of the predicted attention graph. This allows for detailed comparison between different models along their Pareto curves, important to guide future benchmarks for sparse attention models.
%R 10.18653/v1/2022.spnlp-1.7
%U https://aclanthology.org/2022.spnlp-1.7
%U https://doi.org/10.18653/v1/2022.spnlp-1.7
%P 67-81
Markdown (Informal)
[Predicting Attention Sparsity in Transformers](https://aclanthology.org/2022.spnlp-1.7) (Treviso et al., spnlp 2022)
ACL
Marcos Treviso, António Góis, Patrick Fernandes, Erick Fonseca, and Andre Martins. 2022. Predicting Attention Sparsity in Transformers. In Proceedings of the Sixth Workshop on Structured Prediction for NLP, pages 67–81, Dublin, Ireland. Association for Computational Linguistics.