@inproceedings{chivers-etal-2022-ants,
title = "{ANTS}: A Framework for Retrieval of Text Segments in Unstructured Documents",
author = "Chivers, Brian and
Jiang, Mason P. and
Lee, Wonhee and
Ng, Amy and
Rapstine, Natalya I. and
Storer, Alex",
editor = "Cherry, Colin and
Fan, Angela and
Foster, George and
Haffari, Gholamreza (Reza) and
Khadivi, Shahram and
Peng, Nanyun (Violet) and
Ren, Xiang and
Shareghi, Ehsan and
Swayamdipta, Swabha",
booktitle = "Proceedings of the Third Workshop on Deep Learning for Low-Resource Natural Language Processing",
month = jul,
year = "2022",
address = "Hybrid",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.deeplo-1.5",
doi = "10.18653/v1/2022.deeplo-1.5",
pages = "38--47",
abstract = "Text segmentation and extraction from unstructured documents can provide business researchers with a wealth of new information on firms and their behaviors. However, the most valuable text is often difficult to extract consistently due to substantial variations in how content can appear from document to document. Thus, the most successful way to extract this content has been through costly crowdsourcing and training of manual workers. We propose the Assisted Neural Text Segmentation (ANTS) framework to identify pertinent text in unstructured documents from a small set of labeled examples. ANTS leverages deep learning and transfer learning architectures to empower researchers to identify relevant text with minimal manual coding. Using a real world sample of accounting documents, we identify targeted sections 96{\%} of the time using only 5 training examples.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chivers-etal-2022-ants">
<titleInfo>
<title>ANTS: A Framework for Retrieval of Text Segments in Unstructured Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Chivers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mason</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wonhee</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amy</namePart>
<namePart type="family">Ng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Natalya</namePart>
<namePart type="given">I</namePart>
<namePart type="family">Rapstine</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Storer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Deep Learning for Low-Resource Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Colin</namePart>
<namePart type="family">Cherry</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Angela</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">George</namePart>
<namePart type="family">Foster</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gholamreza</namePart>
<namePart type="given">(Reza)</namePart>
<namePart type="family">Haffari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shahram</namePart>
<namePart type="family">Khadivi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nanyun</namePart>
<namePart type="given">(Violet)</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiang</namePart>
<namePart type="family">Ren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehsan</namePart>
<namePart type="family">Shareghi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Swabha</namePart>
<namePart type="family">Swayamdipta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hybrid</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Text segmentation and extraction from unstructured documents can provide business researchers with a wealth of new information on firms and their behaviors. However, the most valuable text is often difficult to extract consistently due to substantial variations in how content can appear from document to document. Thus, the most successful way to extract this content has been through costly crowdsourcing and training of manual workers. We propose the Assisted Neural Text Segmentation (ANTS) framework to identify pertinent text in unstructured documents from a small set of labeled examples. ANTS leverages deep learning and transfer learning architectures to empower researchers to identify relevant text with minimal manual coding. Using a real world sample of accounting documents, we identify targeted sections 96% of the time using only 5 training examples.</abstract>
<identifier type="citekey">chivers-etal-2022-ants</identifier>
<identifier type="doi">10.18653/v1/2022.deeplo-1.5</identifier>
<location>
<url>https://aclanthology.org/2022.deeplo-1.5</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>38</start>
<end>47</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ANTS: A Framework for Retrieval of Text Segments in Unstructured Documents
%A Chivers, Brian
%A Jiang, Mason P.
%A Lee, Wonhee
%A Ng, Amy
%A Rapstine, Natalya I.
%A Storer, Alex
%Y Cherry, Colin
%Y Fan, Angela
%Y Foster, George
%Y Haffari, Gholamreza (Reza)
%Y Khadivi, Shahram
%Y Peng, Nanyun (Violet)
%Y Ren, Xiang
%Y Shareghi, Ehsan
%Y Swayamdipta, Swabha
%S Proceedings of the Third Workshop on Deep Learning for Low-Resource Natural Language Processing
%D 2022
%8 July
%I Association for Computational Linguistics
%C Hybrid
%F chivers-etal-2022-ants
%X Text segmentation and extraction from unstructured documents can provide business researchers with a wealth of new information on firms and their behaviors. However, the most valuable text is often difficult to extract consistently due to substantial variations in how content can appear from document to document. Thus, the most successful way to extract this content has been through costly crowdsourcing and training of manual workers. We propose the Assisted Neural Text Segmentation (ANTS) framework to identify pertinent text in unstructured documents from a small set of labeled examples. ANTS leverages deep learning and transfer learning architectures to empower researchers to identify relevant text with minimal manual coding. Using a real world sample of accounting documents, we identify targeted sections 96% of the time using only 5 training examples.
%R 10.18653/v1/2022.deeplo-1.5
%U https://aclanthology.org/2022.deeplo-1.5
%U https://doi.org/10.18653/v1/2022.deeplo-1.5
%P 38-47
Markdown (Informal)
[ANTS: A Framework for Retrieval of Text Segments in Unstructured Documents](https://aclanthology.org/2022.deeplo-1.5) (Chivers et al., DeepLo 2022)
ACL