@inproceedings{toker-etal-2025-padding,
    title = "Padding Tone: A Mechanistic Analysis of Padding Tokens in {T}2{I} Models",
    author = "Toker, Michael and
      Galil, Ido and
      Orgad, Hadas and
      Gal, Rinon and
      Tewel, Yoad and
      Chechik, Gal and
      Belinkov, Yonatan",
    editor = "Chiruzzo, Luis and
      Ritter, Alan and
      Wang, Lu",
    booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
    month = apr,
    year = "2025",
    address = "Albuquerque, New Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.naacl-long.389/",
    doi = "10.18653/v1/2025.naacl-long.389",
    pages = "7618--7632",
    ISBN = "979-8-89176-189-6",
    abstract = "Text-to-image (T2I) diffusion models rely on encoded prompts to guide the image generation process. Typically, these prompts are extended to a fixed length by appending padding tokens to the input. Despite being a default practice, the influence of padding tokens on the image generation process has not been investigated. In this work, we conduct the first in-depth analysis of the role padding tokens play in T2I models. We develop two causal techniques to analyze how information is encoded in the representation of tokens across different components of the T2I pipeline. Using these techniques, we investigate when and how padding tokens impact the image generation process. Our findings reveal three distinct scenarios: padding tokens may affect the model{'}s output during text encoding, during the diffusion process, or be effectively ignored. Moreover, we identify key relationships between these scenarios and the model{'}s architecture (cross or self-attention) and its training process (frozen or trained text encoder). These insights contribute to a deeper understanding of the mechanisms of padding tokens, potentially informing future model design and training practices in T2I systems."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="toker-etal-2025-padding">
    <titleInfo>
      <title>Padding Tone: A Mechanistic Analysis of Padding Tokens in T2I Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Michael</namePart>
      <namePart type="family">Toker</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ido</namePart>
      <namePart type="family">Galil</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hadas</namePart>
      <namePart type="family">Orgad</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rinon</namePart>
      <namePart type="family">Gal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yoad</namePart>
      <namePart type="family">Tewel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gal</namePart>
      <namePart type="family">Chechik</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yonatan</namePart>
      <namePart type="family">Belinkov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-189-6</identifier>
    </relatedItem>
    <abstract>Text-to-image (T2I) diffusion models rely on encoded prompts to guide the image generation process. Typically, these prompts are extended to a fixed length by appending padding tokens to the input. Despite being a default practice, the influence of padding tokens on the image generation process has not been investigated. In this work, we conduct the first in-depth analysis of the role padding tokens play in T2I models. We develop two causal techniques to analyze how information is encoded in the representation of tokens across different components of the T2I pipeline. Using these techniques, we investigate when and how padding tokens impact the image generation process. Our findings reveal three distinct scenarios: padding tokens may affect the model’s output during text encoding, during the diffusion process, or be effectively ignored. Moreover, we identify key relationships between these scenarios and the model’s architecture (cross or self-attention) and its training process (frozen or trained text encoder). These insights contribute to a deeper understanding of the mechanisms of padding tokens, potentially informing future model design and training practices in T2I systems.</abstract>
    <identifier type="citekey">toker-etal-2025-padding</identifier>
    <identifier type="doi">10.18653/v1/2025.naacl-long.389</identifier>
    <location>
      <url>https://aclanthology.org/2025.naacl-long.389/</url>
    </location>
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>7618</start>
        <end>7632</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Padding Tone: A Mechanistic Analysis of Padding Tokens in T2I Models
%A Toker, Michael
%A Galil, Ido
%A Orgad, Hadas
%A Gal, Rinon
%A Tewel, Yoad
%A Chechik, Gal
%A Belinkov, Yonatan
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F toker-etal-2025-padding
%X Text-to-image (T2I) diffusion models rely on encoded prompts to guide the image generation process. Typically, these prompts are extended to a fixed length by appending padding tokens to the input. Despite being a default practice, the influence of padding tokens on the image generation process has not been investigated. In this work, we conduct the first in-depth analysis of the role padding tokens play in T2I models. We develop two causal techniques to analyze how information is encoded in the representation of tokens across different components of the T2I pipeline. Using these techniques, we investigate when and how padding tokens impact the image generation process. Our findings reveal three distinct scenarios: padding tokens may affect the model’s output during text encoding, during the diffusion process, or be effectively ignored. Moreover, we identify key relationships between these scenarios and the model’s architecture (cross or self-attention) and its training process (frozen or trained text encoder). These insights contribute to a deeper understanding of the mechanisms of padding tokens, potentially informing future model design and training practices in T2I systems.
%R 10.18653/v1/2025.naacl-long.389
%U https://aclanthology.org/2025.naacl-long.389/
%U https://doi.org/10.18653/v1/2025.naacl-long.389
%P 7618-7632

Markdown (Informal)
[Padding Tone: A Mechanistic Analysis of Padding Tokens in T2I Models](https://aclanthology.org/2025.naacl-long.389/) (Toker et al., NAACL 2025)

ACL
Michael Toker, Ido Galil, Hadas Orgad, Rinon Gal, Yoad Tewel, Gal Chechik, and Yonatan Belinkov. 2025. Padding Tone: A Mechanistic Analysis of Padding Tokens in T2I Models. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 7618–7632, Albuquerque, New Mexico. Association for Computational Linguistics.
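
The abstract notes that T2I prompts are typically extended to a fixed length by appending padding tokens. As a minimal illustration of that padding step (not code from the paper), the sketch below shows how the CLIP tokenizer used by common Stable Diffusion-style pipelines pads a prompt to its fixed 77-token context; the checkpoint name and prompt are illustrative assumptions.

```python
from transformers import CLIPTokenizer

# CLIP text encoder tokenizer used by many T2I diffusion pipelines
# (e.g., Stable Diffusion); the checkpoint name is an illustrative choice.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

prompt = "a photo of a cat wearing a red scarf"  # arbitrary example prompt

# Pad the prompt to the encoder's fixed context length (77 tokens for CLIP).
# All positions beyond the real prompt tokens are padding tokens -- the part
# of the sequence whose role the paper analyzes.
enc = tokenizer(
    prompt,
    padding="max_length",
    max_length=tokenizer.model_max_length,  # 77
    truncation=True,
)

print(len(enc["input_ids"]))       # 77: full fixed-length sequence
print(sum(enc["attention_mask"]))  # number of non-padding positions
```

In such pipelines the padded positions usually far outnumber the prompt tokens, which is why their downstream effect (during text encoding, during diffusion, or none) is the question the paper studies.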