@inproceedings{shukla-etal-2022-dosa,
title = "{D}o{SA} : A System to Accelerate Annotations on Business Documents with Human-in-the-Loop",
author = "Shukla, Neelesh and
Raja, Msp and
Katikeri, Raghu and
Vaid, Amit",
editor = "Dragut, Eduard and
Li, Yunyao and
Popa, Lucian and
Vucetic, Slobodan and
Srivastava, Shashank",
booktitle = "Proceedings of the Fourth Workshop on Data Science with Human-in-the-Loop (Language Advances)",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.dash-1.4",
pages = "23--27",
abstract = "Business documents come in a variety of structures, formats and information needs which makes information extraction a challenging task. Due to these variations, having a document generic model which can work well across all types of documents for all the use cases seems far-fetched. For document-specific models, we would need customized document-specific labels. We introduce DoSA (Document Specific Automated Annotations), which helps annotators in generating initial annotations automatically using our novel bootstrap approach by leveraging document generic datasets and models. These initial annotations can further be reviewed by a human for correctness. An initial document-specific model can be trained and its inference can be used as feedback for generating more automated annotations. These automated annotations can be reviewed by humanin-the-loop for the correctness and a new improved model can be trained using the current model as pre-trained model before going for the next iteration. In this paper, our scope is limited to Form like documents due to limited availability of generic annotated datasets, but this idea can be extended to a variety of other documents as more datasets are built. An opensource ready-to-use implementation is made available on GitHub.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shukla-etal-2022-dosa">
<titleInfo>
<title>DoSA : A System to Accelerate Annotations on Business Documents with Human-in-the-Loop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Neelesh</namePart>
<namePart type="family">Shukla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Msp</namePart>
<namePart type="family">Raja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raghu</namePart>
<namePart type="family">Katikeri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amit</namePart>
<namePart type="family">Vaid</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on Data Science with Human-in-the-Loop (Language Advances)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eduard</namePart>
<namePart type="family">Dragut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucian</namePart>
<namePart type="family">Popa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Slobodan</namePart>
<namePart type="family">Vucetic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shashank</namePart>
<namePart type="family">Srivastava</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates (Hybrid)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Business documents come in a variety of structures, formats and information needs which makes information extraction a challenging task. Due to these variations, having a document generic model which can work well across all types of documents for all the use cases seems far-fetched. For document-specific models, we would need customized document-specific labels. We introduce DoSA (Document Specific Automated Annotations), which helps annotators in generating initial annotations automatically using our novel bootstrap approach by leveraging document generic datasets and models. These initial annotations can further be reviewed by a human for correctness. An initial document-specific model can be trained and its inference can be used as feedback for generating more automated annotations. These automated annotations can be reviewed by humanin-the-loop for the correctness and a new improved model can be trained using the current model as pre-trained model before going for the next iteration. In this paper, our scope is limited to Form like documents due to limited availability of generic annotated datasets, but this idea can be extended to a variety of other documents as more datasets are built. An opensource ready-to-use implementation is made available on GitHub.</abstract>
<identifier type="citekey">shukla-etal-2022-dosa</identifier>
<location>
<url>https://aclanthology.org/2022.dash-1.4</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>23</start>
<end>27</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DoSA : A System to Accelerate Annotations on Business Documents with Human-in-the-Loop
%A Shukla, Neelesh
%A Raja, Msp
%A Katikeri, Raghu
%A Vaid, Amit
%Y Dragut, Eduard
%Y Li, Yunyao
%Y Popa, Lucian
%Y Vucetic, Slobodan
%Y Srivastava, Shashank
%S Proceedings of the Fourth Workshop on Data Science with Human-in-the-Loop (Language Advances)
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates (Hybrid)
%F shukla-etal-2022-dosa
%X Business documents come in a variety of structures, formats and information needs which makes information extraction a challenging task. Due to these variations, having a document generic model which can work well across all types of documents for all the use cases seems far-fetched. For document-specific models, we would need customized document-specific labels. We introduce DoSA (Document Specific Automated Annotations), which helps annotators in generating initial annotations automatically using our novel bootstrap approach by leveraging document generic datasets and models. These initial annotations can further be reviewed by a human for correctness. An initial document-specific model can be trained and its inference can be used as feedback for generating more automated annotations. These automated annotations can be reviewed by humanin-the-loop for the correctness and a new improved model can be trained using the current model as pre-trained model before going for the next iteration. In this paper, our scope is limited to Form like documents due to limited availability of generic annotated datasets, but this idea can be extended to a variety of other documents as more datasets are built. An opensource ready-to-use implementation is made available on GitHub.
%U https://aclanthology.org/2022.dash-1.4
%P 23-27
Markdown (Informal)
[DoSA : A System to Accelerate Annotations on Business Documents with Human-in-the-Loop](https://aclanthology.org/2022.dash-1.4) (Shukla et al., DaSH 2022)
ACL