@inproceedings{sandu-etal-2022-large,
  title     = {Large Sequence Representation Learning via Multi-Stage Latent Transformers},
  author    = {Sandu, Ionut-Catalin and
               Voinea, Daniel and
               Popa, Alin-Ionut},
  editor    = {Calzolari, Nicoletta and
               Huang, Chu-Ren and
               Kim, Hansaem and
               Pustejovsky, James and
               Wanner, Leo and
               Choi, Key-Sun and
               Ryu, Pum-Mo and
               Chen, Hsin-Hsi and
               Donatelli, Lucia and
               Ji, Heng and
               Kurohashi, Sadao and
               Paggio, Patrizia and
               Xue, Nianwen and
               Kim, Seokhwan and
               Hahm, Younggyun and
               He, Zhong and
               Lee, Tony Kyungil and
               Santus, Enrico and
               Bond, Francis and
               Na, Seung-Hoon},
  booktitle = {Proceedings of the 29th International Conference on Computational Linguistics},
  month     = oct,
  year      = {2022},
  address   = {Gyeongju, Republic of Korea},
  publisher = {International Committee on Computational Linguistics},
  url       = {https://aclanthology.org/2022.coling-1.410},
  pages     = {4633--4639},
  abstract  = {We present LANTERN, a multi-stage transformer architecture for named-entity recognition (NER) designed to operate on indefinitely large text sequences (i.e. {\textgreater} 512 elements). For a given image of a form with structured text, our method uses language and spatial features to predict the entity tags of each text element. It breaks the quadratic computational constraints of the attention mechanism by operating over a learned latent space representation which encodes the input sequence via the cross-attention mechanism while having the multi-stage encoding component as a refinement over the NER predictions. As a proxy task, we propose RADAR, an LSTM classifier operating at character level, which predicts the relevance of a word with respect to the entity-recognition task. Additionally, we formulate a challenging novel NER use case, nutritional information extraction from food product labels. We created a dataset with 11,926 images depicting food product labels entitled TREAT dataset, with fully detailed annotations. Our method achieves superior performance against two competitive models designed for long sequences on the proposed TREAT dataset.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sandu-etal-2022-large">
<titleInfo>
<title>Large Sequence Representation Learning via Multi-Stage Latent Transformers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ionut-Catalin</namePart>
<namePart type="family">Sandu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Voinea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alin-Ionut</namePart>
<namePart type="family">Popa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 29th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chu-Ren</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hansaem</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Pustejovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Key-Sun</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pum-Mo</namePart>
<namePart type="family">Ryu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hsin-Hsi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Donatelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sadao</namePart>
<namePart type="family">Kurohashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrizia</namePart>
<namePart type="family">Paggio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokhwan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Younggyun</namePart>
<namePart type="family">Hahm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhong</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tony</namePart>
<namePart type="given">Kyungil</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Bond</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seung-Hoon</namePart>
<namePart type="family">Na</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present LANTERN, a multi-stage transformer architecture for named-entity recognition (NER) designed to operate on indefinitely large text sequences (i.e. &gt; 512 elements). For a given image of a form with structured text, our method uses language and spatial features to predict the entity tags of each text element. It breaks the quadratic computational constraints of the attention mechanism by operating over a learned latent space representation which encodes the input sequence via the cross-attention mechanism while having the multi-stage encoding component as a refinement over the NER predictions. As a proxy task, we propose RADAR, an LSTM classifier operating at character level, which predicts the relevance of a word with respect to the entity-recognition task. Additionally, we formulate a challenging novel NER use case, nutritional information extraction from food product labels. We created a dataset with 11,926 images depicting food product labels entitled TREAT dataset, with fully detailed annotations. Our method achieves superior performance against two competitive models designed for long sequences on the proposed TREAT dataset.</abstract>
<identifier type="citekey">sandu-etal-2022-large</identifier>
<location>
<url>https://aclanthology.org/2022.coling-1.410</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>4633</start>
<end>4639</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Large Sequence Representation Learning via Multi-Stage Latent Transformers
%A Sandu, Ionut-Catalin
%A Voinea, Daniel
%A Popa, Alin-Ionut
%Y Calzolari, Nicoletta
%Y Huang, Chu-Ren
%Y Kim, Hansaem
%Y Pustejovsky, James
%Y Wanner, Leo
%Y Choi, Key-Sun
%Y Ryu, Pum-Mo
%Y Chen, Hsin-Hsi
%Y Donatelli, Lucia
%Y Ji, Heng
%Y Kurohashi, Sadao
%Y Paggio, Patrizia
%Y Xue, Nianwen
%Y Kim, Seokhwan
%Y Hahm, Younggyun
%Y He, Zhong
%Y Lee, Tony Kyungil
%Y Santus, Enrico
%Y Bond, Francis
%Y Na, Seung-Hoon
%S Proceedings of the 29th International Conference on Computational Linguistics
%D 2022
%8 October
%I International Committee on Computational Linguistics
%C Gyeongju, Republic of Korea
%F sandu-etal-2022-large
%X We present LANTERN, a multi-stage transformer architecture for named-entity recognition (NER) designed to operate on indefinitely large text sequences (i.e. > 512 elements). For a given image of a form with structured text, our method uses language and spatial features to predict the entity tags of each text element. It breaks the quadratic computational constraints of the attention mechanism by operating over a learned latent space representation which encodes the input sequence via the cross-attention mechanism while having the multi-stage encoding component as a refinement over the NER predictions. As a proxy task, we propose RADAR, an LSTM classifier operating at character level, which predicts the relevance of a word with respect to the entity-recognition task. Additionally, we formulate a challenging novel NER use case, nutritional information extraction from food product labels. We created a dataset with 11,926 images depicting food product labels entitled TREAT dataset, with fully detailed annotations. Our method achieves superior performance against two competitive models designed for long sequences on the proposed TREAT dataset.
%U https://aclanthology.org/2022.coling-1.410
%P 4633-4639
Markdown (Informal)
[Large Sequence Representation Learning via Multi-Stage Latent Transformers](https://aclanthology.org/2022.coling-1.410) (Sandu et al., COLING 2022)
ACL