@inproceedings{hosseini-etal-2023-bert,
title = "{BERT} Has More to Offer: {BERT} Layers Combination Yields Better Sentence Embeddings",
author = "Hosseini, MohammadSaleh and
Munia, Munawara and
Khan, Latifur",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.1030/",
doi = "10.18653/v1/2023.findings-emnlp.1030",
pages = "15419--15431",
abstract = "Obtaining sentence representations from BERT-based models as feature extractors is invaluable as it takes much less time to pre-compute a one-time representation of the data and then use it for the downstream tasks, rather than fine-tune the whole BERT. Most previous works acquire a sentence`s representation by passing it to BERT and averaging its last layer. In this paper, we propose that the combination of certain layers of a BERT-based model rested on the data set and model can achieve substantially better results. We empirically show the effectiveness of our method for different BERT-based models on different tasks and data sets. Specifically, on seven standard semantic textual similarity data sets, we outperform the baseline BERT by improving the Spearman`s correlation by up to 25.75{\%} and on average 16.32{\%} without any further training. We also achieved state-of-the-art results on eight transfer data sets by reducing the relative error by up to 37.41{\%} and on average 17.92{\%}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hosseini-etal-2023-bert">
<titleInfo>
<title>BERT Has More to Offer: BERT Layers Combination Yields Better Sentence Embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">MohammadSaleh</namePart>
<namePart type="family">Hosseini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Munawara</namePart>
<namePart type="family">Munia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Latifur</namePart>
<namePart type="family">Khan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Obtaining sentence representations from BERT-based models as feature extractors is invaluable as it takes much less time to pre-compute a one-time representation of the data and then use it for the downstream tasks, rather than fine-tune the whole BERT. Most previous works acquire a sentence's representation by passing it to BERT and averaging its last layer. In this paper, we propose that the combination of certain layers of a BERT-based model rested on the data set and model can achieve substantially better results. We empirically show the effectiveness of our method for different BERT-based models on different tasks and data sets. Specifically, on seven standard semantic textual similarity data sets, we outperform the baseline BERT by improving the Spearman's correlation by up to 25.75% and on average 16.32% without any further training. We also achieved state-of-the-art results on eight transfer data sets by reducing the relative error by up to 37.41% and on average 17.92%.</abstract>
<identifier type="citekey">hosseini-etal-2023-bert</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.1030</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.1030/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>15419</start>
<end>15431</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BERT Has More to Offer: BERT Layers Combination Yields Better Sentence Embeddings
%A Hosseini, MohammadSaleh
%A Munia, Munawara
%A Khan, Latifur
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F hosseini-etal-2023-bert
%X Obtaining sentence representations from BERT-based models as feature extractors is invaluable as it takes much less time to pre-compute a one-time representation of the data and then use it for the downstream tasks, rather than fine-tune the whole BERT. Most previous works acquire a sentence's representation by passing it to BERT and averaging its last layer. In this paper, we propose that the combination of certain layers of a BERT-based model rested on the data set and model can achieve substantially better results. We empirically show the effectiveness of our method for different BERT-based models on different tasks and data sets. Specifically, on seven standard semantic textual similarity data sets, we outperform the baseline BERT by improving the Spearman's correlation by up to 25.75% and on average 16.32% without any further training. We also achieved state-of-the-art results on eight transfer data sets by reducing the relative error by up to 37.41% and on average 17.92%.
%R 10.18653/v1/2023.findings-emnlp.1030
%U https://aclanthology.org/2023.findings-emnlp.1030/
%U https://doi.org/10.18653/v1/2023.findings-emnlp.1030
%P 15419-15431
Markdown (Informal)
[BERT Has More to Offer: BERT Layers Combination Yields Better Sentence Embeddings](https://aclanthology.org/2023.findings-emnlp.1030/) (Hosseini et al., Findings 2023)
ACL
MohammadSaleh Hosseini, Munawara Munia, and Latifur Khan. 2023. BERT Has More to Offer: BERT Layers Combination Yields Better Sentence Embeddings. In Findings of the Association for Computational Linguistics: EMNLP 2023, pages 15419–15431, Singapore. Association for Computational Linguistics.
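The abstract describes obtaining sentence embeddings by combining (averaging) selected hidden layers of a BERT-based model rather than using only the last layer. The sketch below is a minimal, hypothetical illustration of that idea using the Hugging Face transformers API; it is not the authors' released code, and the layer indices and mean pooling are illustrative assumptions rather than the per-model, per-data-set layer selection the paper proposes.

```python
# Hypothetical sketch: average a chosen combination of BERT hidden layers,
# then mean-pool over tokens to get a sentence embedding.
import torch
from transformers import AutoModel, AutoTokenizer

MODEL_NAME = "bert-base-uncased"
LAYERS_TO_COMBINE = [1, 2, 12]  # illustrative choice, not taken from the paper

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)
model.eval()

def sentence_embedding(sentence: str) -> torch.Tensor:
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # hidden_states is a tuple of (num_layers + 1) tensors of shape
    # [batch, seq_len, hidden]; index 0 is the embedding layer.
    hidden_states = outputs.hidden_states
    chosen = torch.stack([hidden_states[i] for i in LAYERS_TO_COMBINE])
    combined = chosen.mean(dim=0)                   # average the selected layers
    mask = inputs["attention_mask"].unsqueeze(-1)   # ignore padding tokens
    summed = (combined * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1)
    return (summed / counts).squeeze(0)             # mean-pooled sentence vector

emb = sentence_embedding("BERT has more to offer.")
print(emb.shape)  # torch.Size([768])
```

On semantic textual similarity benchmarks, such an embedding would typically be scored with cosine similarity and Spearman's correlation against gold labels, matching the evaluation the abstract reports.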