@inproceedings{kosmajac-etal-2020-dnlp,
title = "{DNLP}@{F}in{TOC}{'}20: Table of Contents Detection in Financial Documents",
author = "Kosmajac, Dijana and
Taylor, Stacey and
Saeidi, Mozhgan",
editor = "El-Haj, Dr Mahmoud and
Athanasakou, Dr Vasiliki and
Ferradans, Dr Sira and
Salzedo, Dr Catherine and
Elhag, Dr Ans and
Bouamor, Dr Houda and
Litvak, Dr Marina and
Rayson, Dr Paul and
Giannakopoulos, Dr George and
Pittaras, Nikiforos",
booktitle = "Proceedings of the 1st Joint Workshop on Financial Narrative Processing and MultiLing Financial Summarisation",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "COLING",
url = "https://aclanthology.org/2020.fnp-1.29",
pages = "169--173",
abstract = "Title Detection and Table of Contents Generation are important components in detecting document structure. In particular, these two elements serve to provide the skeleton of the document, providing users with an understanding of organization, as well as the relevance of information, and where to find information within the document. Here, we show that using Tesseract with Levenshtein distance, a feature set inspired by Alk et al., we were able to correctly classify the title to an F1 measure of 0.73 and 0.87, and the table-of-contents to a harmonic mean of 0.36 and 0.39, in English and French respectively. Our methodology works with both PDF and scanned documents, giving it a wide range of applicability within the document engineering and storage domains.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kosmajac-etal-2020-dnlp">
<titleInfo>
<title>DNLP@FinTOC’20: Table of Contents Detection in Financial Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dijana</namePart>
<namePart type="family">Kosmajac</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stacey</namePart>
<namePart type="family">Taylor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mozhgan</namePart>
<namePart type="family">Saeidi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Joint Workshop on Financial Narrative Processing and MultiLing Financial Summarisation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Mahmoud</namePart>
<namePart type="family">El-Haj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Vasiliki</namePart>
<namePart type="family">Athanasakou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Sira</namePart>
<namePart type="family">Ferradans</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Catherine</namePart>
<namePart type="family">Salzedo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Ans</namePart>
<namePart type="family">Elhag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Marina</namePart>
<namePart type="family">Litvak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">George</namePart>
<namePart type="family">Giannakopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikiforos</namePart>
<namePart type="family">Pittaras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>COLING</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Title Detection and Table of Contents Generation are important components in detecting document structure. In particular, these two elements serve to provide the skeleton of the document, providing users with an understanding of organization, as well as the relevance of information, and where to find information within the document. Here, we show that using Tesseract with Levenshtein distance, a feature set inspired by Alk et al., we were able to correctly classify the title to an F1 measure of 0.73 and 0.87, and the table-of-contents to a harmonic mean of 0.36 and 0.39, in English and French respectively. Our methodology works with both PDF and scanned documents, giving it a wide range of applicability within the document engineering and storage domains.</abstract>
<identifier type="citekey">kosmajac-etal-2020-dnlp</identifier>
<location>
<url>https://aclanthology.org/2020.fnp-1.29</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>169</start>
<end>173</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DNLP@FinTOC’20: Table of Contents Detection in Financial Documents
%A Kosmajac, Dijana
%A Taylor, Stacey
%A Saeidi, Mozhgan
%Y El-Haj, Dr Mahmoud
%Y Athanasakou, Dr Vasiliki
%Y Ferradans, Dr Sira
%Y Salzedo, Dr Catherine
%Y Elhag, Dr Ans
%Y Bouamor, Dr Houda
%Y Litvak, Dr Marina
%Y Rayson, Dr Paul
%Y Giannakopoulos, Dr George
%Y Pittaras, Nikiforos
%S Proceedings of the 1st Joint Workshop on Financial Narrative Processing and MultiLing Financial Summarisation
%D 2020
%8 December
%I COLING
%C Barcelona, Spain (Online)
%F kosmajac-etal-2020-dnlp
%X Title Detection and Table of Contents Generation are important components in detecting document structure. In particular, these two elements serve to provide the skeleton of the document, providing users with an understanding of organization, as well as the relevance of information, and where to find information within the document. Here, we show that using Tesseract with Levenshtein distance, a feature set inspired by Alk et al., we were able to correctly classify the title to an F1 measure of 0.73 and 0.87, and the table-of-contents to a harmonic mean of 0.36 and 0.39, in English and French respectively. Our methodology works with both PDF and scanned documents, giving it a wide range of applicability within the document engineering and storage domains.
%U https://aclanthology.org/2020.fnp-1.29
%P 169-173
Markdown (Informal)
[DNLP@FinTOC’20: Table of Contents Detection in Financial Documents](https://aclanthology.org/2020.fnp-1.29) (Kosmajac et al., FNP 2020)
ACL