@inproceedings{holt-chisholm-2018-extracting,
title = "Extracting structured data from invoices",
author = "Holt, Xavier and
Chisholm, Andrew",
editor = "Kim, Sunghwan Mac and
Zhang, Xiuzhen (Jenny)",
booktitle = "Proceedings of the Australasian Language Technology Association Workshop 2018",
month = dec,
year = "2018",
address = "Dunedin, New Zealand",
url = "https://aclanthology.org/U18-1006",
pages = "53--59",
abstract = "Business documents encode a wealth of information in a format tailored to human consumption {--} i.e. aesthetically disbursed natural language text, graphics and tables. We address the task of extracting key fields (e.g. the amount due on an invoice) from a wide-variety of potentially unseen document formats. In contrast to traditional template driven extraction systems, we introduce a content-driven machine-learning approach which is both robust to noise and generalises to unseen document formats. In a comparison of our approach with alternative invoice extraction systems, we observe an absolute accuracy gain of 20{\%} across compared fields, and a 25{\%}{--}94{\%} reduction in extraction latency.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="holt-chisholm-2018-extracting">
<titleInfo>
<title>Extracting structured data from invoices</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xavier</namePart>
<namePart type="family">Holt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="family">Chisholm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Australasian Language Technology Association Workshop 2018</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sunghwan</namePart>
<namePart type="given">Mac</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiuzhen</namePart>
<namePart type="given">(Jenny)</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<place>
<placeTerm type="text">Dunedin, New Zealand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Business documents encode a wealth of information in a format tailored to human consumption – i.e. aesthetically disbursed natural language text, graphics and tables. We address the task of extracting key fields (e.g. the amount due on an invoice) from a wide-variety of potentially unseen document formats. In contrast to traditional template driven extraction systems, we introduce a content-driven machine-learning approach which is both robust to noise and generalises to unseen document formats. In a comparison of our approach with alternative invoice extraction systems, we observe an absolute accuracy gain of 20% across compared fields, and a 25%–94% reduction in extraction latency.</abstract>
<identifier type="citekey">holt-chisholm-2018-extracting</identifier>
<location>
<url>https://aclanthology.org/U18-1006</url>
</location>
<part>
<date>2018-12</date>
<extent unit="page">
<start>53</start>
<end>59</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Extracting structured data from invoices
%A Holt, Xavier
%A Chisholm, Andrew
%Y Kim, Sunghwan Mac
%Y Zhang, Xiuzhen (Jenny)
%S Proceedings of the Australasian Language Technology Association Workshop 2018
%D 2018
%8 December
%C Dunedin, New Zealand
%F holt-chisholm-2018-extracting
%X Business documents encode a wealth of information in a format tailored to human consumption – i.e. aesthetically disbursed natural language text, graphics and tables. We address the task of extracting key fields (e.g. the amount due on an invoice) from a wide-variety of potentially unseen document formats. In contrast to traditional template driven extraction systems, we introduce a content-driven machine-learning approach which is both robust to noise and generalises to unseen document formats. In a comparison of our approach with alternative invoice extraction systems, we observe an absolute accuracy gain of 20% across compared fields, and a 25%–94% reduction in extraction latency.
%U https://aclanthology.org/U18-1006
%P 53-59
Markdown (Informal)
[Extracting structured data from invoices](https://aclanthology.org/U18-1006) (Holt & Chisholm, ALTA 2018)
ACL