% BibTeX record for https://aclanthology.org/2026.alvr-main.27/ (paper in the ALVR workshop proceedings, 2026).
@inproceedings{hasan-2026-fragile,
  title     = {How Fragile Is Vision-Language Alignment? Mapping Concept Disruption Under Text-to-Image Personalization},
  author    = {Hasan, Mujtaba},
  editor    = {Yan, Qianqi and
               Montariol, Syrielle and
               Fan, Yue and
               Gu, Jing and
               Pan, Jiayi and
               Li, Manling and
               Kordjamshidi, Parisa and
               Suhr, Alane and
               Wang, Xin Eric},
  booktitle = {Proceedings of the 4th Workshop on Advances in Language and Vision Research ({ALVR})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.alvr-main.27/},
  pages     = {278--286},
  isbn      = {979-8-89176-398-2},
  abstract  = {Text-to-image diffusion models learn a mapping from natural language to visual structure, but how robust is this mapping to perturbation? We use personalization{---}fine-tuning a model to learn a new face, object, or style{---}as a controlled stress test to probe the fragility of learned vision-language alignment. We find that fine-tuning for one concept systematically shifts the model{'}s ability to faithfully render unrelated concepts, and that this disruption follows structured, predictable patterns. To measure this fragility, we construct Concept Entanglement Maps: per-prompt, per-model disruption matrices that reveal which concepts are most affected and why. Using Stable Diffusion v1.5 as a controlled testbed, we evaluate 15 subjects across three personalization methods on 200 prompts and report three findings about the organization of vision-language alignment: (1) aggregate disruption is larger for vision-backbone and cross-attention perturbations than for text-embedding perturbations, despite the latter directly modifying the language representation; (2) abstract and compositional language is significantly more fragile than concrete, object-specific language; and (3) disruption does not follow semantic proximity{---}personalizing for a face does not preferentially disrupt other face-related prompts ($p = 1.0$), suggesting that alignment vulnerability is organized globally rather than purely by semantic category. These findings expose a structural vulnerability in current text-to-image personalization: the same cross-attention mechanism that enables compositional generalization also creates pathways through which local fine-tuning can propagate as global alignment shift.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for citekey hasan-2026-fragile: one paper plus the host proceedings it appears in. -->
  <mods ID="hasan-2026-fragile">
    <titleInfo>
      <title>How Fragile Is Vision-Language Alignment? Mapping Concept Disruption Under Text-to-Image Personalization</title>
    </titleInfo>
    <!-- Paper author -->
    <name type="personal">
      <namePart type="given">Mujtaba</namePart>
      <namePart type="family">Hasan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 4th Workshop on Advances in Language and Vision Research (ALVR)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Qianqi</namePart>
        <namePart type="family">Yan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Syrielle</namePart>
        <namePart type="family">Montariol</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yue</namePart>
        <namePart type="family">Fan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jing</namePart>
        <namePart type="family">Gu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiayi</namePart>
        <namePart type="family">Pan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Manling</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Parisa</namePart>
        <namePart type="family">Kordjamshidi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alane</namePart>
        <namePart type="family">Suhr</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Xin</namePart>
        <namePart type="given">Eric</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-398-2</identifier>
    </relatedItem>
    <abstract>Text-to-image diffusion models learn a mapping from natural language to visual structure, but how robust is this mapping to perturbation? We use personalization—fine-tuning a model to learn a new face, object, or style—as a controlled stress test to probe the fragility of learned vision-language alignment. We find that fine-tuning for one concept systematically shifts the model’s ability to faithfully render unrelated concepts, and that this disruption follows structured, predictable patterns. To measure this fragility, we construct Concept Entanglement Maps: per-prompt, per-model disruption matrices that reveal which concepts are most affected and why. Using Stable Diffusion v1.5 as a controlled testbed, we evaluate 15 subjects across three personalization methods on 200 prompts and report three findings about the organization of vision-language alignment: (1) aggregate disruption is larger for vision-backbone and cross-attention perturbations than for text-embedding perturbations, despite the latter directly modifying the language representation; (2) abstract and compositional language is significantly more fragile than concrete, object-specific language; and (3) disruption does not follow semantic proximity—personalizing for a face does not preferentially disrupt other face-related prompts (p = 1.0), suggesting that alignment vulnerability is organized globally rather than purely by semantic category. These findings expose a structural vulnerability in current text-to-image personalization: the same cross-attention mechanism that enables compositional generalization also creates pathways through which local fine-tuning can propagate as global alignment shift.</abstract>
    <identifier type="citekey">hasan-2026-fragile</identifier>
    <location>
      <url>https://aclanthology.org/2026.alvr-main.27/</url>
    </location>
    <!-- Date and page extent recorded for this paper -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>278</start>
        <end>286</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T How Fragile Is Vision-Language Alignment? Mapping Concept Disruption Under Text-to-Image Personalization
%A Hasan, Mujtaba
%Y Yan, Qianqi
%Y Montariol, Syrielle
%Y Fan, Yue
%Y Gu, Jing
%Y Pan, Jiayi
%Y Li, Manling
%Y Kordjamshidi, Parisa
%Y Suhr, Alane
%Y Wang, Xin Eric
%S Proceedings of the 4th Workshop on Advances in Language and Vision Research (ALVR)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-398-2
%F hasan-2026-fragile
%X Text-to-image diffusion models learn a mapping from natural language to visual structure, but how robust is this mapping to perturbation? We use personalization—fine-tuning a model to learn a new face, object, or style—as a controlled stress test to probe the fragility of learned vision-language alignment. We find that fine-tuning for one concept systematically shifts the model’s ability to faithfully render unrelated concepts, and that this disruption follows structured, predictable patterns. To measure this fragility, we construct Concept Entanglement Maps: per-prompt, per-model disruption matrices that reveal which concepts are most affected and why. Using Stable Diffusion v1.5 as a controlled testbed, we evaluate 15 subjects across three personalization methods on 200 prompts and report three findings about the organization of vision-language alignment: (1) aggregate disruption is larger for vision-backbone and cross-attention perturbations than for text-embedding perturbations, despite the latter directly modifying the language representation; (2) abstract and compositional language is significantly more fragile than concrete, object-specific language; and (3) disruption does not follow semantic proximity—personalizing for a face does not preferentially disrupt other face-related prompts (p = 1.0), suggesting that alignment vulnerability is organized globally rather than purely by semantic category. These findings expose a structural vulnerability in current text-to-image personalization: the same cross-attention mechanism that enables compositional generalization also creates pathways through which local fine-tuning can propagate as global alignment shift.
%U https://aclanthology.org/2026.alvr-main.27/
%P 278-286
Markdown (Informal)
[How Fragile Is Vision-Language Alignment? Mapping Concept Disruption Under Text-to-Image Personalization](https://aclanthology.org/2026.alvr-main.27/) (Hasan, ALVR 2026)
ACL