@inproceedings{kamoda-etal-2025-weight,
title = "Weight-based Analysis of Detokenization in Language Models: Understanding the First Stage of Inference Without Inference",
author = "Kamoda, Go and
Heinzerling, Benjamin and
Inaba, Tatsuro and
Kudo, Keito and
Sakaguchi, Keisuke and
Inui, Kentaro",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.355/",
doi = "10.18653/v1/2025.findings-naacl.355",
pages = "6324--6343",
ISBN = "979-8-89176-195-7",
abstract = "According to the stages-of-inference hypothesis, early layers of language models map their subword-tokenized input, which does not necessarily correspond to a linguistically meaningful segmentation, to more meaningful representations that form the model{'}s ``inner vocabulary''.Prior analysis of this *detokenization* stage has predominantly relied on probing and interventions such as path patching, which involve selecting particular inputs, choosing a subset of components that will be patched, and then observing changes in model behavior.Here, we show that several important aspects of the detokenization stage can be understood purely by analyzing model weights, without performing any model inference steps.Specifically, we introduce an analytical decomposition of first-layer attention in GPT-2.Our decomposition yields interpretable terms that quantify the relative contributions of position-related, token-related, and mixed effects.By focusing on terms in this decomposition, we discover weight-based explanations of attention bias toward close tokens and attention for detokenization."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kamoda-etal-2025-weight">
<titleInfo>
<title>Weight-based Analysis of Detokenization in Language Models: Understanding the First Stage of Inference Without Inference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Go</namePart>
<namePart type="family">Kamoda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Heinzerling</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatsuro</namePart>
<namePart type="family">Inaba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keito</namePart>
<namePart type="family">Kudo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keisuke</namePart>
<namePart type="family">Sakaguchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>According to the stages-of-inference hypothesis, early layers of language models map their subword-tokenized input, which does not necessarily correspond to a linguistically meaningful segmentation, to more meaningful representations that form the model’s “inner vocabulary”. Prior analysis of this *detokenization* stage has predominantly relied on probing and interventions such as path patching, which involve selecting particular inputs, choosing a subset of components that will be patched, and then observing changes in model behavior. Here, we show that several important aspects of the detokenization stage can be understood purely by analyzing model weights, without performing any model inference steps. Specifically, we introduce an analytical decomposition of first-layer attention in GPT-2. Our decomposition yields interpretable terms that quantify the relative contributions of position-related, token-related, and mixed effects. By focusing on terms in this decomposition, we discover weight-based explanations of attention bias toward close tokens and attention for detokenization.</abstract>
<identifier type="citekey">kamoda-etal-2025-weight</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.355</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.355/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>6324</start>
<end>6343</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Weight-based Analysis of Detokenization in Language Models: Understanding the First Stage of Inference Without Inference
%A Kamoda, Go
%A Heinzerling, Benjamin
%A Inaba, Tatsuro
%A Kudo, Keito
%A Sakaguchi, Keisuke
%A Inui, Kentaro
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F kamoda-etal-2025-weight
%X According to the stages-of-inference hypothesis, early layers of language models map their subword-tokenized input, which does not necessarily correspond to a linguistically meaningful segmentation, to more meaningful representations that form the model’s “inner vocabulary”. Prior analysis of this *detokenization* stage has predominantly relied on probing and interventions such as path patching, which involve selecting particular inputs, choosing a subset of components that will be patched, and then observing changes in model behavior. Here, we show that several important aspects of the detokenization stage can be understood purely by analyzing model weights, without performing any model inference steps. Specifically, we introduce an analytical decomposition of first-layer attention in GPT-2. Our decomposition yields interpretable terms that quantify the relative contributions of position-related, token-related, and mixed effects. By focusing on terms in this decomposition, we discover weight-based explanations of attention bias toward close tokens and attention for detokenization.
%R 10.18653/v1/2025.findings-naacl.355
%U https://aclanthology.org/2025.findings-naacl.355/
%U https://doi.org/10.18653/v1/2025.findings-naacl.355
%P 6324-6343
Markdown (Informal)
[Weight-based Analysis of Detokenization in Language Models: Understanding the First Stage of Inference Without Inference](https://aclanthology.org/2025.findings-naacl.355/) (Kamoda et al., Findings 2025)
ACL
Go Kamoda, Benjamin Heinzerling, Tatsuro Inaba, Keito Kudo, Keisuke Sakaguchi, and Kentaro Inui. 2025. [Weight-based Analysis of Detokenization in Language Models: Understanding the First Stage of Inference Without Inference](https://aclanthology.org/2025.findings-naacl.355/). In *Findings of the Association for Computational Linguistics: NAACL 2025*, pages 6324–6343, Albuquerque, New Mexico. Association for Computational Linguistics.
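As a reading aid only (not part of the bibliographic record above): the "analytical decomposition of first-layer attention" mentioned in the abstract plausibly has the general shape sketched below, assuming the layer-0 query/key inputs in GPT-2 are approximately the sum of a token embedding $\mathbf{e}_{t_i}$ and a positional embedding $\mathbf{p}_i$; LayerNorm and bias terms are omitted here, and the exact terms in the paper may differ.

```latex
% Hedged sketch: expanding the GPT-2 first-layer attention logit into
% token-token, mixed (token-position), and position-position terms,
% assuming the pre-attention input is e_{t_i} + p_i (LayerNorm omitted).
\[
  A \;=\; \frac{W_Q^{\top} W_K}{\sqrt{d_k}}, \qquad
  \mathrm{logit}(i,j) \;\approx\;
  (\mathbf{e}_{t_i} + \mathbf{p}_i)^{\top} A \, (\mathbf{e}_{t_j} + \mathbf{p}_j)
\]
\[
  \;=\;
  \underbrace{\mathbf{e}_{t_i}^{\top} A \, \mathbf{e}_{t_j}}_{\text{token--token}}
  \;+\;
  \underbrace{\mathbf{e}_{t_i}^{\top} A \, \mathbf{p}_j
            + \mathbf{p}_i^{\top} A \, \mathbf{e}_{t_j}}_{\text{mixed}}
  \;+\;
  \underbrace{\mathbf{p}_i^{\top} A \, \mathbf{p}_j}_{\text{position--position}}
\]
```

All quantities in this identity (token embeddings, positional embeddings, and the query/key weight matrices) are model weights, which is consistent with the abstract's claim that these contributions can be computed without running any inference.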