% Holt and Chisholm, "Extracting structured data from invoices", pp. 53--59 of the
% Proceedings of the Australasian Language Technology Association Workshop 2018.
% The month uses the predefined BibTeX macro (dec) so styles can abbreviate or
% localise it; the abstract is reproduced verbatim from the original record.
@inproceedings{Holt:Chisholm:2018:ALTA2018,
  author    = {Holt, Xavier and Chisholm, Andrew},
  title     = {Extracting structured data from invoices},
  booktitle = {Proceedings of the Australasian Language Technology Association Workshop 2018},
  month     = dec,
  year      = {2018},
  address   = {Dunedin, New Zealand},
  pages     = {53--59},
  abstract  = {Business documents encode a wealth of information in a format tailored to human consumption -- i.e. aesthetically disbursed natural language text, graphics and tables. We address the task of extracting key fields (e.g. the amount due on an invoice) from a wide-variety of potentially unseen document formats. In contrast to traditional template driven extraction systems, we introduce a content-driven machine-learning approach which is both robust to noise and generalises to unseen document formats. In a comparison of our approach with alternative invoice extraction systems, we observe an absolute accuracy gain of 20\% across compared fields, and a 25\%--94\% reduction in extraction latency.},
  url       = {http://www.aclweb.org/anthology/U18-1006}
}
