@inproceedings{zhu-soricut-2021-h,
title = "{H}-Transformer-1{D}: Fast One-Dimensional Hierarchical Attention for Sequences",
author = "Zhu, Zhenhai and
Soricut, Radu",
editor = "Zong, Chengqing and
Xia, Fei and
Li, Wenjie and
Navigli, Roberto",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.acl-long.294",
doi = "10.18653/v1/2021.acl-long.294",
pages = "3801--3815",
abstract = "We describe an efficient hierarchical method to compute attention in the Transformer architecture. The proposed attention mechanism exploits a matrix structure similar to the Hierarchical Matrix (H-Matrix) developed by the numerical analysis community, and has linear run time and memory complexity. We perform extensive experiments to show that the inductive bias embodied by our hierarchical attention is effective in capturing the hierarchical structure in the sequences typical for natural language and vision tasks. Our method is superior to alternative sub-quadratic proposals by over +6 points on average on the Long Range Arena benchmark. It also sets a new SOTA test perplexity on One-Billion Word dataset with 5x fewer model parameters than that of the previous-best Transformer-based models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhu-soricut-2021-h">
<titleInfo>
<title>H-Transformer-1D: Fast One-Dimensional Hierarchical Attention for Sequences</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhenhai</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Radu</namePart>
<namePart type="family">Soricut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenjie</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Navigli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We describe an efficient hierarchical method to compute attention in the Transformer architecture. The proposed attention mechanism exploits a matrix structure similar to the Hierarchical Matrix (H-Matrix) developed by the numerical analysis community, and has linear run time and memory complexity. We perform extensive experiments to show that the inductive bias embodied by our hierarchical attention is effective in capturing the hierarchical structure in the sequences typical for natural language and vision tasks. Our method is superior to alternative sub-quadratic proposals by over +6 points on average on the Long Range Arena benchmark. It also sets a new SOTA test perplexity on One-Billion Word dataset with 5x fewer model parameters than that of the previous-best Transformer-based models.</abstract>
<identifier type="citekey">zhu-soricut-2021-h</identifier>
<identifier type="doi">10.18653/v1/2021.acl-long.294</identifier>
<location>
<url>https://aclanthology.org/2021.acl-long.294</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>3801</start>
<end>3815</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T H-Transformer-1D: Fast One-Dimensional Hierarchical Attention for Sequences
%A Zhu, Zhenhai
%A Soricut, Radu
%Y Zong, Chengqing
%Y Xia, Fei
%Y Li, Wenjie
%Y Navigli, Roberto
%S Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F zhu-soricut-2021-h
%X We describe an efficient hierarchical method to compute attention in the Transformer architecture. The proposed attention mechanism exploits a matrix structure similar to the Hierarchical Matrix (H-Matrix) developed by the numerical analysis community, and has linear run time and memory complexity. We perform extensive experiments to show that the inductive bias embodied by our hierarchical attention is effective in capturing the hierarchical structure in the sequences typical for natural language and vision tasks. Our method is superior to alternative sub-quadratic proposals by over +6 points on average on the Long Range Arena benchmark. It also sets a new SOTA test perplexity on One-Billion Word dataset with 5x fewer model parameters than that of the previous-best Transformer-based models.
%R 10.18653/v1/2021.acl-long.294
%U https://aclanthology.org/2021.acl-long.294
%U https://doi.org/10.18653/v1/2021.acl-long.294
%P 3801-3815
Markdown (Informal)
[H-Transformer-1D: Fast One-Dimensional Hierarchical Attention for Sequences](https://aclanthology.org/2021.acl-long.294) (Zhu & Soricut, ACL-IJCNLP 2021)
ACL
Zhenhai Zhu and Radu Soricut. 2021. [H-Transformer-1D: Fast One-Dimensional Hierarchical Attention for Sequences](https://aclanthology.org/2021.acl-long.294). In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pages 3801–3815, Online. Association for Computational Linguistics.