@inproceedings{loukas-etal-2021-edgar,
title = "{EDGAR}-{CORPUS}: Billions of Tokens Make The World Go Round",
author = "Loukas, Lefteris and
Fergadiotis, Manos and
Androutsopoulos, Ion and
Malakasiotis, Prodromos",
editor = "Hahn, Udo and
Hoste, Veronique and
Stent, Amanda",
booktitle = "Proceedings of the Third Workshop on Economics and Natural Language Processing",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.econlp-1.2",
doi = "10.18653/v1/2021.econlp-1.2",
pages = "13--18",
abstract = "We release EDGAR-CORPUS, a novel corpus comprising annual reports from all the publicly traded companies in the US spanning a period of more than 25 years. To the best of our knowledge, EDGAR-CORPUS is the largest financial NLP corpus available to date. All the reports are downloaded, split into their corresponding items (sections), and provided in a clean, easy-to-use JSON format. We use EDGAR-CORPUS to train and release EDGAR-W2V, which are WORD2VEC embeddings for the financial domain. We employ these embeddings in a battery of financial NLP tasks and showcase their superiority over generic GloVe embeddings and other existing financial word embeddings. We also open-source EDGAR-CRAWLER, a toolkit that facilitates downloading and extracting future annual reports.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="loukas-etal-2021-edgar">
<titleInfo>
<title>EDGAR-CORPUS: Billions of Tokens Make The World Go Round</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lefteris</namePart>
<namePart type="family">Loukas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manos</namePart>
<namePart type="family">Fergadiotis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ion</namePart>
<namePart type="family">Androutsopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prodromos</namePart>
<namePart type="family">Malakasiotis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Economics and Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Udo</namePart>
<namePart type="family">Hahn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amanda</namePart>
<namePart type="family">Stent</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We release EDGAR-CORPUS, a novel corpus comprising annual reports from all the publicly traded companies in the US spanning a period of more than 25 years. To the best of our knowledge, EDGAR-CORPUS is the largest financial NLP corpus available to date. All the reports are downloaded, split into their corresponding items (sections), and provided in a clean, easy-to-use JSON format. We use EDGAR-CORPUS to train and release EDGAR-W2V, which are WORD2VEC embeddings for the financial domain. We employ these embeddings in a battery of financial NLP tasks and showcase their superiority over generic GloVe embeddings and other existing financial word embeddings. We also open-source EDGAR-CRAWLER, a toolkit that facilitates downloading and extracting future annual reports.</abstract>
<identifier type="citekey">loukas-etal-2021-edgar</identifier>
<identifier type="doi">10.18653/v1/2021.econlp-1.2</identifier>
<location>
<url>https://aclanthology.org/2021.econlp-1.2</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>13</start>
<end>18</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T EDGAR-CORPUS: Billions of Tokens Make The World Go Round
%A Loukas, Lefteris
%A Fergadiotis, Manos
%A Androutsopoulos, Ion
%A Malakasiotis, Prodromos
%Y Hahn, Udo
%Y Hoste, Veronique
%Y Stent, Amanda
%S Proceedings of the Third Workshop on Economics and Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F loukas-etal-2021-edgar
%X We release EDGAR-CORPUS, a novel corpus comprising annual reports from all the publicly traded companies in the US spanning a period of more than 25 years. To the best of our knowledge, EDGAR-CORPUS is the largest financial NLP corpus available to date. All the reports are downloaded, split into their corresponding items (sections), and provided in a clean, easy-to-use JSON format. We use EDGAR-CORPUS to train and release EDGAR-W2V, which are WORD2VEC embeddings for the financial domain. We employ these embeddings in a battery of financial NLP tasks and showcase their superiority over generic GloVe embeddings and other existing financial word embeddings. We also open-source EDGAR-CRAWLER, a toolkit that facilitates downloading and extracting future annual reports.
%R 10.18653/v1/2021.econlp-1.2
%U https://aclanthology.org/2021.econlp-1.2
%U https://doi.org/10.18653/v1/2021.econlp-1.2
%P 13-18
Markdown (Informal)
[EDGAR-CORPUS: Billions of Tokens Make The World Go Round](https://aclanthology.org/2021.econlp-1.2) (Loukas et al., ECONLP 2021)
ACL
- Lefteris Loukas, Manos Fergadiotis, Ion Androutsopoulos, and Prodromos Malakasiotis. 2021. EDGAR-CORPUS: Billions of Tokens Make The World Go Round. In Proceedings of the Third Workshop on Economics and Natural Language Processing, pages 13–18, Punta Cana, Dominican Republic. Association for Computational Linguistics.