@inproceedings{anderson-etal-2024-greenback,
title = "Greenback Bears and Fiscal Hawks: Finance is a Jungle and Text Embeddings Must Adapt",
author = "Anderson, Peter and
Janardhanan, Mano Vikash and
He, Jason and
Cheng, Wei and
Flanagan, Charlie",
editor = "Dernoncourt, Franck and
Preo{\c{t}}iuc-Pietro, Daniel and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2024",
    address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-industry.26",
doi = "10.18653/v1/2024.emnlp-industry.26",
pages = "362--370",
abstract = "Financial documents are filled with specialized terminology, arcane jargon, and curious acronyms that pose challenges for general-purpose text embeddings. Yet, few text embeddings specialized for finance have been reported in the literature, perhaps in part due to a lack of public datasets and benchmarks. We present BAM embeddings, a set of text embeddings finetuned on a carefully constructed dataset of 14.3M query-passage pairs including both public and proprietary financial documents. Demonstrating the benefits of domain-specific training, BAM embeddings achieve Recall@1 of 62.8{\%} on a held-out test set, vs. only 39.2{\%} for the best general-purpose text embedding from OpenAI. Further, BAM embeddings increase question answering accuracy by 8{\%} on FinanceBench and show increased sensitivity to the finance-specific elements that are found in detailed, forward-looking and company and date-specific queries. To support further research we describe our approach in detail, quantify the importance of hard negative mining and dataset scale, and publicly release our embeddings.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="anderson-etal-2024-greenback">
<titleInfo>
<title>Greenback Bears and Fiscal Hawks: Finance is a Jungle and Text Embeddings Must Adapt</title>
</titleInfo>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Anderson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mano</namePart>
<namePart type="given">Vikash</namePart>
<namePart type="family">Janardhanan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jason</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Charlie</namePart>
<namePart type="family">Flanagan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoţiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Financial documents are filled with specialized terminology, arcane jargon, and curious acronyms that pose challenges for general-purpose text embeddings. Yet, few text embeddings specialized for finance have been reported in the literature, perhaps in part due to a lack of public datasets and benchmarks. We present BAM embeddings, a set of text embeddings finetuned on a carefully constructed dataset of 14.3M query-passage pairs including both public and proprietary financial documents. Demonstrating the benefits of domain-specific training, BAM embeddings achieve Recall@1 of 62.8% on a held-out test set, vs. only 39.2% for the best general-purpose text embedding from OpenAI. Further, BAM embeddings increase question answering accuracy by 8% on FinanceBench and show increased sensitivity to the finance-specific elements that are found in detailed, forward-looking and company and date-specific queries. To support further research we describe our approach in detail, quantify the importance of hard negative mining and dataset scale, and publicly release our embeddings.</abstract>
<identifier type="citekey">anderson-etal-2024-greenback</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-industry.26</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-industry.26</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>362</start>
<end>370</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Greenback Bears and Fiscal Hawks: Finance is a Jungle and Text Embeddings Must Adapt
%A Anderson, Peter
%A Janardhanan, Mano Vikash
%A He, Jason
%A Cheng, Wei
%A Flanagan, Charlie
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F anderson-etal-2024-greenback
%X Financial documents are filled with specialized terminology, arcane jargon, and curious acronyms that pose challenges for general-purpose text embeddings. Yet, few text embeddings specialized for finance have been reported in the literature, perhaps in part due to a lack of public datasets and benchmarks. We present BAM embeddings, a set of text embeddings finetuned on a carefully constructed dataset of 14.3M query-passage pairs including both public and proprietary financial documents. Demonstrating the benefits of domain-specific training, BAM embeddings achieve Recall@1 of 62.8% on a held-out test set, vs. only 39.2% for the best general-purpose text embedding from OpenAI. Further, BAM embeddings increase question answering accuracy by 8% on FinanceBench and show increased sensitivity to the finance-specific elements that are found in detailed, forward-looking and company and date-specific queries. To support further research we describe our approach in detail, quantify the importance of hard negative mining and dataset scale, and publicly release our embeddings.
%R 10.18653/v1/2024.emnlp-industry.26
%U https://aclanthology.org/2024.emnlp-industry.26
%U https://doi.org/10.18653/v1/2024.emnlp-industry.26
%P 362-370
Markdown (Informal)
[Greenback Bears and Fiscal Hawks: Finance is a Jungle and Text Embeddings Must Adapt](https://aclanthology.org/2024.emnlp-industry.26) (Anderson et al., EMNLP 2024)
ACL
Peter Anderson, Mano Vikash Janardhanan, Jason He, Wei Cheng, and Charlie Flanagan. 2024. Greenback Bears and Fiscal Hawks: Finance is a Jungle and Text Embeddings Must Adapt. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 362–370, Miami, Florida, USA. Association for Computational Linguistics.