BibTeX
@inproceedings{coelho-etal-2024-dwell,
title = "Dwell in the Beginning: How Language Models Embed Long Documents for Dense Retrieval",
author = "Coelho, Jo{\~a}o and
Martins, Bruno and
Magalhaes, Joao and
Callan, Jamie and
Xiong, Chenyan",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-short.35/",
doi = "10.18653/v1/2024.acl-short.35",
pages = "370--377",
abstract = "This study investigates the existence of positional biases in Transformer-based language models for text representation learning, particularly in the context of web document retrieval. We build on previous research that demonstrated loss of information in the middle of input sequences for causal language models, extending it to the domain of embedding learning. We examine positional biases at multiple stages of the training pipeline for an encoder-decoder neural retrieval model, namely language model pre-training, contrastive pre-training, and contrastive fine-tuning. Experiments with the MS-MARCO document collection reveal that after contrastive pre-training the model already generates embeddings that better capture the beginning of the input content, with fine-tuning further aggravating this effect."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="coelho-etal-2024-dwell">
<titleInfo>
<title>Dwell in the Beginning: How Language Models Embed Long Documents for Dense Retrieval</title>
</titleInfo>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Coelho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bruno</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joao</namePart>
<namePart type="family">Magalhaes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jamie</namePart>
<namePart type="family">Callan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenyan</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This study investigates the existence of positional biases in Transformer-based language models for text representation learning, particularly in the context of web document retrieval. We build on previous research that demonstrated loss of information in the middle of input sequences for causal language models, extending it to the domain of embedding learning. We examine positional biases at multiple stages of the training pipeline for an encoder-decoder neural retrieval model, namely language model pre-training, contrastive pre-training, and contrastive fine-tuning. Experiments with the MS-MARCO document collection reveal that after contrastive pre-training the model already generates embeddings that better capture the beginning of the input content, with fine-tuning further aggravating this effect.</abstract>
<identifier type="citekey">coelho-etal-2024-dwell</identifier>
<identifier type="doi">10.18653/v1/2024.acl-short.35</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-short.35/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>370</start>
<end>377</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Dwell in the Beginning: How Language Models Embed Long Documents for Dense Retrieval
%A Coelho, João
%A Martins, Bruno
%A Magalhaes, Joao
%A Callan, Jamie
%A Xiong, Chenyan
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F coelho-etal-2024-dwell
%X This study investigates the existence of positional biases in Transformer-based language models for text representation learning, particularly in the context of web document retrieval. We build on previous research that demonstrated loss of information in the middle of input sequences for causal language models, extending it to the domain of embedding learning. We examine positional biases at multiple stages of the training pipeline for an encoder-decoder neural retrieval model, namely language model pre-training, contrastive pre-training, and contrastive fine-tuning. Experiments with the MS-MARCO document collection reveal that after contrastive pre-training the model already generates embeddings that better capture the beginning of the input content, with fine-tuning further aggravating this effect.
%R 10.18653/v1/2024.acl-short.35
%U https://aclanthology.org/2024.luhme-short.35/
%U https://doi.org/10.18653/v1/2024.acl-short.35
%P 370-377
Markdown (Informal)
[Dwell in the Beginning: How Language Models Embed Long Documents for Dense Retrieval](https://aclanthology.org/2024.luhme-short.35/) (Coelho et al., ACL 2024)
ACL
João Coelho, Bruno Martins, Joao Magalhaes, Jamie Callan, and Chenyan Xiong. 2024. [Dwell in the Beginning: How Language Models Embed Long Documents for Dense Retrieval](https://aclanthology.org/2024.luhme-short.35/). In *Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)*, pages 370–377, Bangkok, Thailand. Association for Computational Linguistics.