@inproceedings{leng-etal-2026-token,
title = "Token-Wise Kernels ({TW}i{K}ers) for Vicinity-Aware Attention in Transformers",
author = "Leng, Kuangdai and
Bi, Jia and
Pinilla, Samuel and
Cha, Jaehoon",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.306/",
pages = "5819--5835",
ISBN = "979-8-89176-386-9",
abstract = "Self-attention mechanisms in transformers enable tokens to interact across a sequence but lack an explicit inductive bias to capture local contextual dependencies, an inherent characteristic of natural languages. We propose Token-Wise Kernels (TWiKers), a novel enhancement to transformers that learn token-specific convolutional kernels applied to the keys or values. Each token is assigned a small kernel, initialized to the ``Central Dirac'' (e.g., [0,1,0] for size=3), meaning the token ``bears'' the attention from all other tokens alone. During training, these kernels adapt, and greater deviation from the Central Dirac indicates stronger attention redistribution to neighboring tokens. This introduces the first transformer weights with direct semantic interpretability. Our experiments show that content words (e.g., nouns and verbs) retain self-focus, while function words (e.g., prepositions and conjunctions) shift attention toward their neighbors, aligning with their syntactic and semantic roles. We further apply TWiKers to distinguish literary genres, historical periods, and authors, demonstrating their effectiveness in capturing high-level stylistic patterns. Finally, we demonstrate the potential of TWiKers as an effective inductive bias to improve transformer training, validated across a range of downstream tasks."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="leng-etal-2026-token">
<titleInfo>
<title>Token-Wise Kernels (TWiKers) for Vicinity-Aware Attention in Transformers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kuangdai</namePart>
<namePart type="family">Leng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jia</namePart>
<namePart type="family">Bi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Pinilla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaehoon</namePart>
<namePart type="family">Cha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>Self-attention mechanisms in transformers enable tokens to interact across a sequence but lack an explicit inductive bias to capture local contextual dependencies, an inherent characteristic of natural languages. We propose Token-Wise Kernels (TWiKers), a novel enhancement to transformers that learn token-specific convolutional kernels applied to the keys or values. Each token is assigned a small kernel, initialized to the “Central Dirac” (e.g., [0,1,0] for size=3), meaning the token “bears” the attention from all other tokens alone. During training, these kernels adapt, and greater deviation from the Central Dirac indicates stronger attention redistribution to neighboring tokens. This introduces the first transformer weights with direct semantic interpretability. Our experiments show that content words (e.g., nouns and verbs) retain self-focus, while function words (e.g., prepositions and conjunctions) shift attention toward their neighbors, aligning with their syntactic and semantic roles. We further apply TWiKers to distinguish literary genres, historical periods, and authors, demonstrating their effectiveness in capturing high-level stylistic patterns. Finally, we demonstrate the potential of TWiKers as an effective inductive bias to improve transformer training, validated across a range of downstream tasks.</abstract>
<identifier type="citekey">leng-etal-2026-token</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.306/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>5819</start>
<end>5835</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Token-Wise Kernels (TWiKers) for Vicinity-Aware Attention in Transformers
%A Leng, Kuangdai
%A Bi, Jia
%A Pinilla, Samuel
%A Cha, Jaehoon
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F leng-etal-2026-token
%X Self-attention mechanisms in transformers enable tokens to interact across a sequence but lack an explicit inductive bias to capture local contextual dependencies, an inherent characteristic of natural languages. We propose Token-Wise Kernels (TWiKers), a novel enhancement to transformers that learn token-specific convolutional kernels applied to the keys or values. Each token is assigned a small kernel, initialized to the “Central Dirac” (e.g., [0,1,0] for size=3), meaning the token “bears” the attention from all other tokens alone. During training, these kernels adapt, and greater deviation from the Central Dirac indicates stronger attention redistribution to neighboring tokens. This introduces the first transformer weights with direct semantic interpretability. Our experiments show that content words (e.g., nouns and verbs) retain self-focus, while function words (e.g., prepositions and conjunctions) shift attention toward their neighbors, aligning with their syntactic and semantic roles. We further apply TWiKers to distinguish literary genres, historical periods, and authors, demonstrating their effectiveness in capturing high-level stylistic patterns. Finally, we demonstrate the potential of TWiKers as an effective inductive bias to improve transformer training, validated across a range of downstream tasks.
%U https://aclanthology.org/2026.findings-eacl.306/
%P 5819-5835
Markdown (Informal)
[Token-Wise Kernels (TWiKers) for Vicinity-Aware Attention in Transformers](https://aclanthology.org/2026.findings-eacl.306/) (Leng et al., Findings 2026)
ACL