@inproceedings{masson-paroubek-2024-evaluating,
title = "Evaluating Topic Model on Asymmetric and Multi-Domain Financial Corpus",
author = "Masson, Corentin and
Paroubek, Patrick",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.578",
pages = "6515--6529",
abstract = "Multiple recent research works in Finance try to quantify the exposure of market assets to various risks from text and how assets react if the risk materializes. We consider risk sections from French Financial Corporate Annual Reports, which are regulated documents with a mandatory section containing important risks the company is facing, to extract an accurate risk profile and exposure of companies. We identify multiple pitfalls of topic models when applied to corporate filing financial domain data for unsupervised risk distribution extraction, which has not yet been studied in this domain. We propose two new metrics to evaluate the behavior of different types of topic models with respect to the previously mentioned pitfalls of document risk distribution extraction. Our evaluation will focus on three aspects: regularizations, down-sampling and data augmentation. In our experiments, we found that classic Topic Models require down-sampling to obtain unbiased risks, while Topic Models using metadata and in-domain pre-trained word-embeddings partially correct the coherence imbalance per subdomain and remove sector-specific language from the detected themes. We then demonstrate the relevance and usefulness of the extracted information with visualizations that help to understand the content of such a corpus and its evolution over the years.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="masson-paroubek-2024-evaluating">
<titleInfo>
<title>Evaluating Topic Model on Asymmetric and Multi-Domain Financial Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Corentin</namePart>
<namePart type="family">Masson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Paroubek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multiple recent research works in Finance try to quantify the exposure of market assets to various risks from text and how assets react if the risk materializes. We consider risk sections from French Financial Corporate Annual Reports, which are regulated documents with a mandatory section containing important risks the company is facing, to extract an accurate risk profile and exposure of companies. We identify multiple pitfalls of topic models when applied to corporate filing financial domain data for unsupervised risk distribution extraction, which has not yet been studied in this domain. We propose two new metrics to evaluate the behavior of different types of topic models with respect to the previously mentioned pitfalls of document risk distribution extraction. Our evaluation will focus on three aspects: regularizations, down-sampling and data augmentation. In our experiments, we found that classic Topic Models require down-sampling to obtain unbiased risks, while Topic Models using metadata and in-domain pre-trained word-embeddings partially correct the coherence imbalance per subdomain and remove sector-specific language from the detected themes. We then demonstrate the relevance and usefulness of the extracted information with visualizations that help to understand the content of such a corpus and its evolution over the years.</abstract>
<identifier type="citekey">masson-paroubek-2024-evaluating</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.578</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>6515</start>
<end>6529</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Topic Model on Asymmetric and Multi-Domain Financial Corpus
%A Masson, Corentin
%A Paroubek, Patrick
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F masson-paroubek-2024-evaluating
%X Multiple recent research works in Finance try to quantify the exposure of market assets to various risks from text and how assets react if the risk materializes. We consider risk sections from French Financial Corporate Annual Reports, which are regulated documents with a mandatory section containing important risks the company is facing, to extract an accurate risk profile and exposure of companies. We identify multiple pitfalls of topic models when applied to corporate filing financial domain data for unsupervised risk distribution extraction, which has not yet been studied in this domain. We propose two new metrics to evaluate the behavior of different types of topic models with respect to the previously mentioned pitfalls of document risk distribution extraction. Our evaluation will focus on three aspects: regularizations, down-sampling and data augmentation. In our experiments, we found that classic Topic Models require down-sampling to obtain unbiased risks, while Topic Models using metadata and in-domain pre-trained word-embeddings partially correct the coherence imbalance per subdomain and remove sector-specific language from the detected themes. We then demonstrate the relevance and usefulness of the extracted information with visualizations that help to understand the content of such a corpus and its evolution over the years.
%U https://aclanthology.org/2024.lrec-main.578
%P 6515-6529
Markdown (Informal)
[Evaluating Topic Model on Asymmetric and Multi-Domain Financial Corpus](https://aclanthology.org/2024.lrec-main.578) (Masson & Paroubek, LREC-COLING 2024)
ACL