@inproceedings{kowsher-etal-2025-self,
title = "Does Self-Attention Need Separate Weights in Transformers?",
author = "Kowsher, Md and
Prottasha, Nusrat Jahan and
Yu, Chun-Nam and
Garibay, Ozlem and
Yousefi, Niloofar",
editor = "Chen, Weizhu and
Yang, Yi and
Kachuee, Mohammad and
Fu, Xue-Yong",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-industry.44/",
doi = "10.18653/v1/2025.naacl-industry.44",
pages = "535--543",
ISBN = "979-8-89176-194-0",
abstract = "Self-attention has revolutionized natural language processing by capturing long-range dependencies and improving context understanding. However, it comes with high computational costs and struggles with sequential data{'}s inherent directionality. This paper investigates and presents a simplified approach called ``shared weight self-attention,'' where a single weight matrix is used for Keys, Queries, and Values instead of separate matrices for each. This approach cuts training parameters by more than half and significantly reduces training time. Our method not only improves efficiency but also achieves strong performance on tasks from the GLUE benchmark, even outperforming the standard BERT baseline in handling noisy and out-of-domain data. Experimental results show a 66.53{\%} reduction in parameter size within the attention block and competitive accuracy improvements of 3.55{\%} and 0.89{\%} over symmetric and pairwise attention-based BERT models, respectively."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kowsher-etal-2025-self">
<titleInfo>
<title>Does Self-Attention Need Separate Weights in Transformers?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="family">Kowsher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nusrat</namePart>
<namePart type="given">Jahan</namePart>
<namePart type="family">Prottasha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chun-Nam</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ozlem</namePart>
<namePart type="family">Garibay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niloofar</namePart>
<namePart type="family">Yousefi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Weizhu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="family">Kachuee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xue-Yong</namePart>
<namePart type="family">Fu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-194-0</identifier>
</relatedItem>
<abstract>Self-attention has revolutionized natural language processing by capturing long-range dependencies and improving context understanding. However, it comes with high computational costs and struggles with sequential data’s inherent directionality. This paper investigates and presents a simplified approach called “shared weight self-attention,” where a single weight matrix is used for Keys, Queries, and Values instead of separate matrices for each. This approach cuts training parameters by more than half and significantly reduces training time. Our method not only improves efficiency but also achieves strong performance on tasks from the GLUE benchmark, even outperforming the standard BERT baseline in handling noisy and out-of-domain data. Experimental results show a 66.53% reduction in parameter size within the attention block and competitive accuracy improvements of 3.55% and 0.89% over symmetric and pairwise attention-based BERT models, respectively.</abstract>
<identifier type="citekey">kowsher-etal-2025-self</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-industry.44</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-industry.44/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>535</start>
<end>543</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Does Self-Attention Need Separate Weights in Transformers?
%A Kowsher, Md
%A Prottasha, Nusrat Jahan
%A Yu, Chun-Nam
%A Garibay, Ozlem
%A Yousefi, Niloofar
%Y Chen, Weizhu
%Y Yang, Yi
%Y Kachuee, Mohammad
%Y Fu, Xue-Yong
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-194-0
%F kowsher-etal-2025-self
%X Self-attention has revolutionized natural language processing by capturing long-range dependencies and improving context understanding. However, it comes with high computational costs and struggles with sequential data’s inherent directionality. This paper investigates and presents a simplified approach called “shared weight self-attention,” where a single weight matrix is used for Keys, Queries, and Values instead of separate matrices for each. This approach cuts training parameters by more than half and significantly reduces training time. Our method not only improves efficiency but also achieves strong performance on tasks from the GLUE benchmark, even outperforming the standard BERT baseline in handling noisy and out-of-domain data. Experimental results show a 66.53% reduction in parameter size within the attention block and competitive accuracy improvements of 3.55% and 0.89% over symmetric and pairwise attention-based BERT models, respectively.
%R 10.18653/v1/2025.naacl-industry.44
%U https://aclanthology.org/2025.naacl-industry.44/
%U https://doi.org/10.18653/v1/2025.naacl-industry.44
%P 535-543
Markdown (Informal)
[Does Self-Attention Need Separate Weights in Transformers?](https://aclanthology.org/2025.naacl-industry.44/) (Kowsher et al., NAACL 2025)
ACL
Md Kowsher, Nusrat Jahan Prottasha, Chun-Nam Yu, Ozlem Garibay, and Niloofar Yousefi. 2025. Does Self-Attention Need Separate Weights in Transformers?. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track), pages 535–543, Albuquerque, New Mexico. Association for Computational Linguistics.
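
The abstract above describes "shared weight self-attention" as using a single weight matrix for Keys, Queries, and Values instead of three separate ones. The snippet below is a minimal sketch of that idea read literally; it is written for this page in PyTorch and is not the authors' released code. The class and variable names (`SharedWeightSelfAttention`, `w_shared`, `w_out`) are invented here, and the paper's exact formulation (biases, per-head details, any additional per-projection components) may differ; note that with Q = K the pre-softmax score matrix is symmetric in this literal reading.

```python
# Hedged sketch of shared-weight self-attention as described in the abstract:
# one projection matrix reused for Queries, Keys, and Values.
# Illustrative only; not the paper's reference implementation.
import math
import torch
import torch.nn as nn


class SharedWeightSelfAttention(nn.Module):
    def __init__(self, d_model: int, n_heads: int):
        super().__init__()
        assert d_model % n_heads == 0
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        # One shared projection instead of separate W_q, W_k, W_v.
        self.w_shared = nn.Linear(d_model, d_model, bias=False)
        self.w_out = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, d_model)
        b, t, _ = x.shape
        h = self.w_shared(x)                                   # single shared projection
        h = h.view(b, t, self.n_heads, self.d_head).transpose(1, 2)
        q = k = v = h                                          # Q, K, V all reuse it
        scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_head)
        attn = scores.softmax(dim=-1)
        out = (attn @ v).transpose(1, 2).reshape(b, t, -1)
        return self.w_out(out)


# Rough parameter comparison (weights only, biases ignored), d_model = 768:
# standard Q/K/V projections: 3 * 768 * 768 = 1,769,472 parameters
# shared projection:          1 * 768 * 768 =   589,824 parameters  (~66.7% fewer)
```

Counting weights only, collapsing the three d×d input projections into one removes roughly two thirds of the attention block's projection parameters, which is broadly in line with the 66.53% attention-block reduction quoted in the abstract.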