@inproceedings{musacchio-etal-2026-mind,
    title     = {Mind Your Special Tokens! On the Importance of Dedicated Sequence-End Tokens in Vision-Language Embedding Models},
    author    = {Musacchio, Elio and
                 Semeraro, Giovanni and
                 Glava{\v{s}}, Goran},
    editor    = {Demberg, Vera and
                 Inui, Kentaro and
                 Marquez, Llu{\'\i}s},
    booktitle = {Proceedings of the 19th Conference of the {European} {Chapter} of the {Association} for {Computational} {Linguistics} (Volume 2: Short Papers)},
    month     = mar,
    year      = {2026},
    address   = {Rabat, Morocco},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.eacl-short.24/},
    pages     = {322--328},
    isbn      = {979-8-89176-381-4},
    abstract  = {Large Vision-Language Models (LVLMs), trained by aligning visual encoders to LLMs on extensive vision-language data, demonstrate impressive performance across a broad variety of tasks that require understanding of both visual and textual inputs. Acknowledging this, recent work proposed to post-hoc convert generative LVLMs into vision-language encoders (VLEs) via supervised contrastive learning objectives. This type of training enables LVLMs to produce better representations, i.e., embeddings for image and text input, used in retrieval and (semantic) similarity tasks. Having observed that this type of VLEs (i.e., LVLMs turned into encoders) commonly employ last-token pooling in downstream tasks, without using special sequence-end tokens, in this focused contribution, we study the effect of pooling strategies on VLEs' downstream performance. We empirically show that, in contrast to mean pooling, last-token pooling (without special sequence-end tokens) makes VLEs highly sensitive to end-of-input artifacts in fine-tuning and inference data, e.g., whether input sequences end with punctuation or newline characters. Finally, we show that introducing the special end-of-sequence token removes this sensitivity and makes VLEs robust to formatting artifacts of input text.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="musacchio-etal-2026-mind">
<titleInfo>
<title>Mind Your Special Tokens! On the Importance of Dedicated Sequence-End Tokens in Vision-Language Embedding Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elio</namePart>
<namePart type="family">Musacchio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giovanni</namePart>
<namePart type="family">Semeraro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Goran</namePart>
<namePart type="family">Glavaš</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-381-4</identifier>
</relatedItem>
<abstract>Large Vision-Language Models (LVLMs), trained by aligning visual encoders to LLMs on extensive vision-language data, demonstrate impressive performance across a broad variety of tasks that require understanding of both visual and textual inputs. Acknowledging this, recent work proposed to post-hoc convert generative LVLMs into vision-language encoders (VLEs) via supervised contrastive learning objectives. This type of training enables LVLMs to produce better representations, i.e., embeddings for image and text input, used in retrieval and (semantic) similarity tasks. Having observed that this type of VLEs (i.e., LVLMs turned into encoders) commonly employ last-token pooling in downstream tasks, without using special sequence-end tokens, in this focused contribution, we study the effect of pooling strategies on VLEs’ downstream performance. We empirically show that, in contrast to mean pooling, last-token pooling (without special sequence-end tokens) makes VLEs highly sensitive to end-of-input artifacts in fine-tuning and inference data, e.g., whether input sequences end with punctuation or newline characters. Finally, we show that introducing the special end-of-sequence token removes this sensitivity and makes VLEs robust to formatting artifacts of input text.</abstract>
<identifier type="citekey">musacchio-etal-2026-mind</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-short.24/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>322</start>
<end>328</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mind Your Special Tokens! On the Importance of Dedicated Sequence-End Tokens in Vision-Language Embedding Models
%A Musacchio, Elio
%A Semeraro, Giovanni
%A Glavaš, Goran
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-381-4
%F musacchio-etal-2026-mind
%X Large Vision-Language Models (LVLMs), trained by aligning visual encoders to LLMs on extensive vision-language data, demonstrate impressive performance across a broad variety of tasks that require understanding of both visual and textual inputs. Acknowledging this, recent work proposed to post-hoc convert generative LVLMs into vision-language encoders (VLEs) via supervised contrastive learning objectives. This type of training enables LVLMs to produce better representations, i.e., embeddings for image and text input, used in retrieval and (semantic) similarity tasks. Having observed that this type of VLEs (i.e., LVLMs turned into encoders) commonly employ last-token pooling in downstream tasks, without using special sequence-end tokens, in this focused contribution, we study the effect of pooling strategies on VLEs’ downstream performance. We empirically show that, in contrast to mean pooling, last-token pooling (without special sequence-end tokens) makes VLEs highly sensitive to end-of-input artifacts in fine-tuning and inference data, e.g., whether input sequences end with punctuation or newline characters. Finally, we show that introducing the special end-of-sequence token removes this sensitivity and makes VLEs robust to formatting artifacts of input text.
%U https://aclanthology.org/2026.eacl-short.24/
%P 322-328
Markdown (Informal)
[Mind Your Special Tokens! On the Importance of Dedicated Sequence-End Tokens in Vision-Language Embedding Models](https://aclanthology.org/2026.eacl-short.24/) (Musacchio et al., EACL 2026)
ACL