@inproceedings{sage-etal-2020-end,
title = "End-to-End Extraction of Structured Information from Business Documents with Pointer-Generator Networks",
author = "Sage, Cl{\'e}ment and
Aussem, Alex and
Eglin, V{\'e}ronique and
Elghazel, Haytham and
Espinas, J{\'e}r{\'e}my",
editor = "Agrawal, Priyanka and
Kozareva, Zornitsa and
Kreutzer, Julia and
Lampouras, Gerasimos and
Martins, Andr{\'e} and
Ravi, Sujith and
Vlachos, Andreas",
booktitle = "Proceedings of the Fourth Workshop on Structured Prediction for NLP",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.spnlp-1.6",
doi = "10.18653/v1/2020.spnlp-1.6",
pages = "43--52",
abstract = "The predominant approaches for extracting key information from documents resort to classifiers predicting the information type of each word. However, the word level ground truth used for learning is expensive to obtain since it is not naturally produced by the extraction task. In this paper, we discuss a new method for training extraction models directly from the textual value of information. The extracted information of a document is represented as a sequence of tokens in the XML language. We learn to output this representation with a pointer-generator network that alternately copies the document words carrying information and generates the XML tags delimiting the types of information. The ability of our end-to-end method to retrieve structured information is assessed on a large set of business documents. We show that it performs competitively with a standard word classifier without requiring costly word level supervision.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sage-etal-2020-end">
<titleInfo>
<title>End-to-End Extraction of Structured Information from Business Documents with Pointer-Generator Networks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Clément</namePart>
<namePart type="family">Sage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Aussem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Véronique</namePart>
<namePart type="family">Eglin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haytham</namePart>
<namePart type="family">Elghazel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jérémy</namePart>
<namePart type="family">Espinas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on Structured Prediction for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Priyanka</namePart>
<namePart type="family">Agrawal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Kreutzer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gerasimos</namePart>
<namePart type="family">Lampouras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">André</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujith</namePart>
<namePart type="family">Ravi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Vlachos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The predominant approaches for extracting key information from documents resort to classifiers predicting the information type of each word. However, the word level ground truth used for learning is expensive to obtain since it is not naturally produced by the extraction task. In this paper, we discuss a new method for training extraction models directly from the textual value of information. The extracted information of a document is represented as a sequence of tokens in the XML language. We learn to output this representation with a pointer-generator network that alternately copies the document words carrying information and generates the XML tags delimiting the types of information. The ability of our end-to-end method to retrieve structured information is assessed on a large set of business documents. We show that it performs competitively with a standard word classifier without requiring costly word level supervision.</abstract>
<identifier type="citekey">sage-etal-2020-end</identifier>
<identifier type="doi">10.18653/v1/2020.spnlp-1.6</identifier>
<location>
<url>https://aclanthology.org/2020.spnlp-1.6</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>43</start>
<end>52</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T End-to-End Extraction of Structured Information from Business Documents with Pointer-Generator Networks
%A Sage, Clément
%A Aussem, Alex
%A Eglin, Véronique
%A Elghazel, Haytham
%A Espinas, Jérémy
%Y Agrawal, Priyanka
%Y Kozareva, Zornitsa
%Y Kreutzer, Julia
%Y Lampouras, Gerasimos
%Y Martins, André
%Y Ravi, Sujith
%Y Vlachos, Andreas
%S Proceedings of the Fourth Workshop on Structured Prediction for NLP
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F sage-etal-2020-end
%X The predominant approaches for extracting key information from documents resort to classifiers predicting the information type of each word. However, the word level ground truth used for learning is expensive to obtain since it is not naturally produced by the extraction task. In this paper, we discuss a new method for training extraction models directly from the textual value of information. The extracted information of a document is represented as a sequence of tokens in the XML language. We learn to output this representation with a pointer-generator network that alternately copies the document words carrying information and generates the XML tags delimiting the types of information. The ability of our end-to-end method to retrieve structured information is assessed on a large set of business documents. We show that it performs competitively with a standard word classifier without requiring costly word level supervision.
%R 10.18653/v1/2020.spnlp-1.6
%U https://aclanthology.org/2020.spnlp-1.6
%U https://doi.org/10.18653/v1/2020.spnlp-1.6
%P 43-52
Markdown (Informal)
[End-to-End Extraction of Structured Information from Business Documents with Pointer-Generator Networks](https://aclanthology.org/2020.spnlp-1.6) (Sage et al., spnlp 2020)
ACL