@inproceedings{zhang-etal-2024-knowvrdu,
title = "{K}now{V}r{DU}: A Unified Knowledge-aware Prompt-Tuning Framework for Visually-rich Document Understanding",
author = "Zhang, Yunqi and
Chen, Yubo and
Zhu, Jingzhe and
Xu, Jinyu and
Yang, Shuai and
Wu, Zhaoliang and
Huang, Liang and
Huang, Yongfeng and
Chen, Shuai",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.863",
pages = "9878--9889",
abstract = "In Visually-rich Document Understanding (VrDU), recent advances of incorporating layout and image features into the pre-training language models have achieved significant progress. Existing methods usually developed complicated dedicated architectures based on pre-trained models and fine-tuned them with costly high-quality data to eliminate the inconsistency of knowledge distribution between the pre-training task and specialized downstream tasks. However, due to their huge data demands, these methods are not suitable for few-shot settings, which are essential for quick applications with limited resources but few previous works are presented. To solve these problems, we propose a unified Knowledge-aware prompt-tuning framework for Visual-rich Document Understanding (KnowVrDU) to enable broad utilization for diverse concrete applications and reduce data requirements. To model heterogeneous VrDU structures without designing task-specific architectures, we propose to reformulate various VrDU tasks into a single question-answering format with task-specific prompts and train the pre-trained model with the parameter-efficient prompt tuning method. To bridge the knowledge gap between the pre-training task and specialized VrDU tasks without additional annotations, we propose a prompt knowledge integration mechanism to leverage external open-source knowledge bases. We conduct experiments on several benchmark datasets in few-shot settings and the results validate the effectiveness of our method.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2024-knowvrdu">
<titleInfo>
<title>KnowVrDU: A Unified Knowledge-aware Prompt-Tuning Framework for Visually-rich Document Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yunqi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yubo</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingzhe</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinyu</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuai</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhaoliang</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liang</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yongfeng</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuai</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In Visually-rich Document Understanding (VrDU), recent advances of incorporating layout and image features into the pre-training language models have achieved significant progress. Existing methods usually developed complicated dedicated architectures based on pre-trained models and fine-tuned them with costly high-quality data to eliminate the inconsistency of knowledge distribution between the pre-training task and specialized downstream tasks. However, due to their huge data demands, these methods are not suitable for few-shot settings, which are essential for quick applications with limited resources but few previous works are presented. To solve these problems, we propose a unified Knowledge-aware prompt-tuning framework for Visual-rich Document Understanding (KnowVrDU) to enable broad utilization for diverse concrete applications and reduce data requirements. To model heterogeneous VrDU structures without designing task-specific architectures, we propose to reformulate various VrDU tasks into a single question-answering format with task-specific prompts and train the pre-trained model with the parameter-efficient prompt tuning method. To bridge the knowledge gap between the pre-training task and specialized VrDU tasks without additional annotations, we propose a prompt knowledge integration mechanism to leverage external open-source knowledge bases. We conduct experiments on several benchmark datasets in few-shot settings and the results validate the effectiveness of our method.</abstract>
<identifier type="citekey">zhang-etal-2024-knowvrdu</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.863</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>9878</start>
<end>9889</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T KnowVrDU: A Unified Knowledge-aware Prompt-Tuning Framework for Visually-rich Document Understanding
%A Zhang, Yunqi
%A Chen, Yubo
%A Zhu, Jingzhe
%A Xu, Jinyu
%A Yang, Shuai
%A Wu, Zhaoliang
%A Huang, Liang
%A Huang, Yongfeng
%A Chen, Shuai
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F zhang-etal-2024-knowvrdu
%X In Visually-rich Document Understanding (VrDU), recent advances in incorporating layout and image features into pre-trained language models have achieved significant progress. Existing methods usually develop complicated dedicated architectures based on pre-trained models and fine-tune them with costly high-quality data to eliminate the inconsistency of knowledge distribution between the pre-training task and specialized downstream tasks. However, due to their huge data demands, these methods are not suitable for few-shot settings, which are essential for quick applications with limited resources but have received little attention in previous work. To solve these problems, we propose a unified Knowledge-aware prompt-tuning framework for Visually-rich Document Understanding (KnowVrDU) to enable broad utilization for diverse concrete applications and reduce data requirements. To model heterogeneous VrDU structures without designing task-specific architectures, we propose to reformulate various VrDU tasks into a single question-answering format with task-specific prompts and train the pre-trained model with a parameter-efficient prompt-tuning method. To bridge the knowledge gap between the pre-training task and specialized VrDU tasks without additional annotations, we propose a prompt knowledge integration mechanism to leverage external open-source knowledge bases. We conduct experiments on several benchmark datasets in few-shot settings, and the results validate the effectiveness of our method.
%U https://aclanthology.org/2024.lrec-main.863
%P 9878-9889
Markdown (Informal)
[KnowVrDU: A Unified Knowledge-aware Prompt-Tuning Framework for Visually-rich Document Understanding](https://aclanthology.org/2024.lrec-main.863) (Zhang et al., LREC-COLING 2024)
ACL
Yunqi Zhang, Yubo Chen, Jingzhe Zhu, Jinyu Xu, Shuai Yang, Zhaoliang Wu, Liang Huang, Yongfeng Huang, and Shuai Chen. 2024. KnowVrDU: A Unified Knowledge-aware Prompt-Tuning Framework for Visually-rich Document Understanding. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 9878–9889, Torino, Italia. ELRA and ICCL.