@inproceedings{haase-kirchhoff-2020-taxy,
title = "Taxy.io@{F}in{TOC}-2020: Multilingual Document Structure Extraction using Transfer Learning",
author = "Haase, Frederic and
Kirchhoff, Steffen",
editor = "El-Haj, Dr Mahmoud and
Athanasakou, Dr Vasiliki and
Ferradans, Dr Sira and
Salzedo, Dr Catherine and
Elhag, Dr Ans and
Bouamor, Dr Houda and
Litvak, Dr Marina and
Rayson, Dr Paul and
Giannakopoulos, Dr George and
Pittaras, Nikiforos",
booktitle = "Proceedings of the 1st Joint Workshop on Financial Narrative Processing and MultiLing Financial Summarisation",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "COLING",
url = "https://aclanthology.org/2020.fnp-1.28",
pages = "163--168",
abstract = "In this paper we describe our system submitted to the FinTOC-2020 shared task on financial document structure extraction. We propose a two-step approach to identify titles in financial documents and to extract their table of contents (TOC). First, we identify text blocks as candidates for titles using unsupervised learning based on character-level information of each document. Then, we apply supervised learning on a self-constructed regression task to predict the depth of each text block in the document structure hierarchy using transfer learning combined with document features and layout features. It is noteworthy that our single multilingual model performs well on both tasks and on different languages, which indicates the usefulness of transfer learning for title detection and TOC generation. Moreover, our approach is independent of the presence of actual TOC pages in the documents. It is also one of the few submissions to the FinTOC-2020 shared task addressing both subtasks in both languages, English and French, with one single model.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="haase-kirchhoff-2020-taxy">
<titleInfo>
<title>Taxy.io@FinTOC-2020: Multilingual Document Structure Extraction using Transfer Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Frederic</namePart>
<namePart type="family">Haase</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steffen</namePart>
<namePart type="family">Kirchhoff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Joint Workshop on Financial Narrative Processing and MultiLing Financial Summarisation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Mahmoud</namePart>
<namePart type="family">El-Haj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Vasiliki</namePart>
<namePart type="family">Athanasakou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Sira</namePart>
<namePart type="family">Ferradans</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Catherine</namePart>
<namePart type="family">Salzedo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Ans</namePart>
<namePart type="family">Elhag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Marina</namePart>
<namePart type="family">Litvak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr</namePart>
<namePart type="given">George</namePart>
<namePart type="family">Giannakopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikiforos</namePart>
<namePart type="family">Pittaras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>COLING</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper we describe our system submitted to the FinTOC-2020 shared task on financial document structure extraction. We propose a two-step approach to identify titles in financial documents and to extract their table of contents (TOC). First, we identify text blocks as candidates for titles using unsupervised learning based on character-level information of each document. Then, we apply supervised learning on a self-constructed regression task to predict the depth of each text block in the document structure hierarchy using transfer learning combined with document features and layout features. It is noteworthy that our single multilingual model performs well on both tasks and on different languages, which indicates the usefulness of transfer learning for title detection and TOC generation. Moreover, our approach is independent of the presence of actual TOC pages in the documents. It is also one of the few submissions to the FinTOC-2020 shared task addressing both subtasks in both languages, English and French, with one single model.</abstract>
<identifier type="citekey">haase-kirchhoff-2020-taxy</identifier>
<location>
<url>https://aclanthology.org/2020.fnp-1.28</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>163</start>
<end>168</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Taxy.io@FinTOC-2020: Multilingual Document Structure Extraction using Transfer Learning
%A Haase, Frederic
%A Kirchhoff, Steffen
%Y El-Haj, Dr Mahmoud
%Y Athanasakou, Dr Vasiliki
%Y Ferradans, Dr Sira
%Y Salzedo, Dr Catherine
%Y Elhag, Dr Ans
%Y Bouamor, Dr Houda
%Y Litvak, Dr Marina
%Y Rayson, Dr Paul
%Y Giannakopoulos, Dr George
%Y Pittaras, Nikiforos
%S Proceedings of the 1st Joint Workshop on Financial Narrative Processing and MultiLing Financial Summarisation
%D 2020
%8 December
%I COLING
%C Barcelona, Spain (Online)
%F haase-kirchhoff-2020-taxy
%X In this paper we describe our system submitted to the FinTOC-2020 shared task on financial document structure extraction. We propose a two-step approach to identify titles in financial documents and to extract their table of contents (TOC). First, we identify text blocks as candidates for titles using unsupervised learning based on character-level information of each document. Then, we apply supervised learning on a self-constructed regression task to predict the depth of each text block in the document structure hierarchy using transfer learning combined with document features and layout features. It is noteworthy that our single multilingual model performs well on both tasks and on different languages, which indicates the usefulness of transfer learning for title detection and TOC generation. Moreover, our approach is independent of the presence of actual TOC pages in the documents. It is also one of the few submissions to the FinTOC-2020 shared task addressing both subtasks in both languages, English and French, with one single model.
%U https://aclanthology.org/2020.fnp-1.28
%P 163-168
Markdown (Informal)
[Taxy.io@FinTOC-2020: Multilingual Document Structure Extraction using Transfer Learning](https://aclanthology.org/2020.fnp-1.28) (Haase & Kirchhoff, FNP 2020)
ACL