@inproceedings{nourbakhsh-etal-2024-aligatr,
title = "{A}li{GAT}r: Graph-based layout generation for form understanding",
author = "Nourbakhsh, Armineh and
Jin, Zhao and
Parekh, Siddharth and
Shah, Sameena and
Rose, Carolyn",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.778/",
doi = "10.18653/v1/2024.findings-emnlp.778",
pages = "13309--13328",
abstract = "Forms constitute a large portion of layout-rich documents that convey information through key-value pairs. Form understanding involves two main tasks, namely, the identification of keys and values (a.k.a. Key Information Extraction or KIE) and the association of keys to corresponding values (a.k.a. Relation Extraction or RE). State of the art models for form understanding often rely on training paradigms that yield poorly calibrated output probabilities and low performance on RE. In this paper, we present AliGATr, a graph-based model that uses a generative objective to represent complex grid-like layouts that are often found in forms. Using a grid-based graph topology, our model learns to generate the layout of each page token by token in a data efficient manner. Despite using 30{\%} fewer parameters than the smallest SotA, AliGATr performs on par with or better than SotA models on the KIE and RE tasks against four datasets. We also show that AliGATr's output probabilities are better calibrated and do not exhibit the over-confident distributions of other SotA models."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nourbakhsh-etal-2024-aligatr">
<titleInfo>
<title>AliGATr: Graph-based layout generation for form understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Armineh</namePart>
<namePart type="family">Nourbakhsh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhao</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siddharth</namePart>
<namePart type="family">Parekh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sameena</namePart>
<namePart type="family">Shah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Forms constitute a large portion of layout-rich documents that convey information through key-value pairs. Form understanding involves two main tasks, namely, the identification of keys and values (a.k.a. Key Information Extraction or KIE) and the association of keys to corresponding values (a.k.a. Relation Extraction or RE). State of the art models for form understanding often rely on training paradigms that yield poorly calibrated output probabilities and low performance on RE. In this paper, we present AliGATr, a graph-based model that uses a generative objective to represent complex grid-like layouts that are often found in forms. Using a grid-based graph topology, our model learns to generate the layout of each page token by token in a data efficient manner. Despite using 30% fewer parameters than the smallest SotA, AliGATr performs on par with or better than SotA models on the KIE and RE tasks against four datasets. We also show that AliGATr's output probabilities are better calibrated and do not exhibit the over-confident distributions of other SotA models.</abstract>
<identifier type="citekey">nourbakhsh-etal-2024-aligatr</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.778</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.778/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>13309</start>
<end>13328</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AliGATr: Graph-based layout generation for form understanding
%A Nourbakhsh, Armineh
%A Jin, Zhao
%A Parekh, Siddharth
%A Shah, Sameena
%A Rose, Carolyn
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F nourbakhsh-etal-2024-aligatr
%X Forms constitute a large portion of layout-rich documents that convey information through key-value pairs. Form understanding involves two main tasks, namely, the identification of keys and values (a.k.a. Key Information Extraction or KIE) and the association of keys to corresponding values (a.k.a. Relation Extraction or RE). State of the art models for form understanding often rely on training paradigms that yield poorly calibrated output probabilities and low performance on RE. In this paper, we present AliGATr, a graph-based model that uses a generative objective to represent complex grid-like layouts that are often found in forms. Using a grid-based graph topology, our model learns to generate the layout of each page token by token in a data efficient manner. Despite using 30% fewer parameters than the smallest SotA, AliGATr performs on par with or better than SotA models on the KIE and RE tasks against four datasets. We also show that AliGATr's output probabilities are better calibrated and do not exhibit the over-confident distributions of other SotA models.
%R 10.18653/v1/2024.findings-emnlp.778
%U https://aclanthology.org/2024.findings-emnlp.778/
%U https://doi.org/10.18653/v1/2024.findings-emnlp.778
%P 13309-13328
Markdown (Informal)
[AliGATr: Graph-based layout generation for form understanding](https://aclanthology.org/2024.findings-emnlp.778/) (Nourbakhsh et al., Findings 2024)
ACL
- Armineh Nourbakhsh, Zhao Jin, Siddharth Parekh, Sameena Shah, and Carolyn Rose. 2024. AliGATr: Graph-based layout generation for form understanding. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 13309–13328, Miami, Florida, USA. Association for Computational Linguistics.