@inproceedings{ajay-mukund-easwarakumar-2023-dispersed,
title = "Dispersed Hierarchical Attention Network for Machine Translation and Language Understanding on Long Documents with Linear Complexity",
author = "Ajay Mukund, S. and
Easwarakumar, K.s.",
editor = "Jyoti, D. Pawar and
Sobha, Lalitha Devi",
booktitle = "Proceedings of the 20th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2023",
address = "Goa University, Goa, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2023.icon-1.10",
pages = "90--98",
abstract = "Transformers, being the forefront of Natural Language Processing and a pioneer in the recent developments, we tweak the very fundamentals of the giant Deep Learning model in this paper. For long documents, the conventional Full SelfAttention exceeds the compute power and the memory requirement as it scales quadratically. Instead, if we use a Local Self-Attention using a sliding window, we lose the global context present in the input document which can impact the performance of the task in hand. For long documents (ranging from 500 to 16K tokens), the proposed Dispersed Hierarchical Attention component captures the local context using a sliding window and the global context using a linearlyscaled dispersion approach. This achieves O(N) linear complexity, where N is the length of the input sequence or document.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ajay-mukund-easwarakumar-2023-dispersed">
<titleInfo>
<title>Dispersed Hierarchical Attention Network for Machine Translation and Language Understanding on Long Documents with Linear Complexity</title>
</titleInfo>
<name type="personal">
<namePart type="given">S</namePart>
<namePart type="family">Ajay Mukund</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">K.s.</namePart>
<namePart type="family">Easwarakumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">D</namePart>
<namePart type="given">Pawar</namePart>
<namePart type="family">Jyoti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lalitha</namePart>
<namePart type="given">Devi</namePart>
<namePart type="family">Sobha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">Goa University, Goa, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Transformers are at the forefront of Natural Language Processing and have pioneered its recent developments; in this paper, we tweak the very fundamentals of this giant Deep Learning model. For long documents, conventional Full Self-Attention exceeds the available compute and memory, as it scales quadratically. If we instead use Local Self-Attention with a sliding window, we lose the global context of the input document, which can hurt the performance of the task at hand. For long documents (ranging from 500 to 16K tokens), the proposed Dispersed Hierarchical Attention component captures the local context using a sliding window and the global context using a linearly-scaled dispersion approach. This achieves O(N) linear complexity, where N is the length of the input sequence or document.</abstract>
<identifier type="citekey">ajay-mukund-easwarakumar-2023-dispersed</identifier>
<location>
<url>https://aclanthology.org/2023.icon-1.10</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>90</start>
<end>98</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Dispersed Hierarchical Attention Network for Machine Translation and Language Understanding on Long Documents with Linear Complexity
%A Ajay Mukund, S.
%A Easwarakumar, K.S.
%Y Pawar, Jyoti D.
%Y Lalitha Devi, Sobha
%S Proceedings of the 20th International Conference on Natural Language Processing (ICON)
%D 2023
%8 December
%I NLP Association of India (NLPAI)
%C Goa University, Goa, India
%F ajay-mukund-easwarakumar-2023-dispersed
%X Transformers are at the forefront of Natural Language Processing and have pioneered its recent developments; in this paper, we tweak the very fundamentals of this giant Deep Learning model. For long documents, conventional Full Self-Attention exceeds the available compute and memory, as it scales quadratically. If we instead use Local Self-Attention with a sliding window, we lose the global context of the input document, which can hurt the performance of the task at hand. For long documents (ranging from 500 to 16K tokens), the proposed Dispersed Hierarchical Attention component captures the local context using a sliding window and the global context using a linearly-scaled dispersion approach. This achieves O(N) linear complexity, where N is the length of the input sequence or document.
%U https://aclanthology.org/2023.icon-1.10
%P 90-98
Markdown (Informal)
[Dispersed Hierarchical Attention Network for Machine Translation and Language Understanding on Long Documents with Linear Complexity](https://aclanthology.org/2023.icon-1.10) (Ajay Mukund & Easwarakumar, ICON 2023)
ACL
S. Ajay Mukund and K.S. Easwarakumar. 2023. Dispersed Hierarchical Attention Network for Machine Translation and Language Understanding on Long Documents with Linear Complexity. In Proceedings of the 20th International Conference on Natural Language Processing (ICON), pages 90–98, Goa University, Goa, India. NLP Association of India (NLPAI).
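The abstract describes combining a local sliding window with a linearly-scaled dispersed pattern to cover global context at O(N) cost. The snippet below is a minimal, hypothetical sketch of such a windowed-plus-dispersed sparsity mask; it is not the authors' implementation, and the `window` and `dispersion_stride` parameters are illustrative assumptions only.

```python
# Hypothetical sketch (not the paper's code): a sparse attention mask combining
# a local sliding window with a dispersed (strided) global pattern, in the spirit
# of the windowed + dispersed attention described in the abstract.
import numpy as np

def dispersed_window_mask(seq_len: int, window: int = 4, dispersion_stride: int = 8) -> np.ndarray:
    """Boolean mask: mask[i, j] is True if query position i may attend to key position j.

    Each token attends to a local window of +/- `window` tokens plus a set of
    dispersed tokens spaced `dispersion_stride` apart, so the number of attended
    positions per token stays far below seq_len for long documents.
    """
    mask = np.zeros((seq_len, seq_len), dtype=bool)
    for i in range(seq_len):
        lo, hi = max(0, i - window), min(seq_len, i + window + 1)
        mask[i, lo:hi] = True                # local context via sliding window
        mask[i, ::dispersion_stride] = True  # dispersed positions for global context
    return mask

if __name__ == "__main__":
    m = dispersed_window_mask(seq_len=32, window=2, dispersion_stride=8)
    # Number of attended positions per token stays small relative to seq_len.
    print(m.sum(axis=1))
```

Such a mask would be applied to the attention scores before the softmax (masked positions set to -inf), keeping per-token work bounded; the paper's hierarchical dispersion scheme is more elaborate than this fixed-stride illustration.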