@inproceedings{ahuja-etal-2022-parsing,
title = "Parsing Electronic Theses and Dissertations Using Object Detection",
author = "Ahuja, Aman and
Devera, Alan and
Fox, Edward Alan",
booktitle = "Proceedings of the first Workshop on Information Extraction from Scientific Publications",
month = nov,
year = "2022",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.wiesp-1.14",
pages = "121--130",
abstract = "Electronic theses and dissertations (ETDs) contain valuable knowledge that can be useful for a wide range of purposes. To effectively utilize the knowledge contained in ETDs for downstream tasks such as search and retrieval, question-answering, and summarization, the data first needs to be parsed and stored in a format such as XML. However, since most of the ETDs available on the web are PDF documents, parsing them to make their data useful for downstream tasks is a challenge. In this work, we propose a dataset and a framework to help with parsing long scholarly documents such as ETDs. We take the Object Detection approach for document parsing. We first introduce a set of objects that are important elements of an ETD, along with a new dataset ETD-OD that consists of over 25K page images originating from 200 ETDs with bounding boxes around each of the objects. We also propose a framework that utilizes this dataset for converting ETDs to XML, which can further be used for ETD-related downstream tasks. Our code and pre-trained models are available at: https://github.com/Opening-ETDs/ETD-OD.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ahuja-etal-2022-parsing">
<titleInfo>
<title>Parsing Electronic Theses and Dissertations Using Object Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aman</namePart>
<namePart type="family">Ahuja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Devera</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edward</namePart>
<namePart type="given">Alan</namePart>
<namePart type="family">Fox</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the first Workshop on Information Extraction from Scientific Publications</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Electronic theses and dissertations (ETDs) contain valuable knowledge that can be useful for a wide range of purposes. To effectively utilize the knowledge contained in ETDs for downstream tasks such as search and retrieval, question-answering, and summarization, the data first needs to be parsed and stored in a format such as XML. However, since most of the ETDs available on the web are PDF documents, parsing them to make their data useful for downstream tasks is a challenge. In this work, we propose a dataset and a framework to help with parsing long scholarly documents such as ETDs. We take the Object Detection approach for document parsing. We first introduce a set of objects that are important elements of an ETD, along with a new dataset ETD-OD that consists of over 25K page images originating from 200 ETDs with bounding boxes around each of the objects. We also propose a framework that utilizes this dataset for converting ETDs to XML, which can further be used for ETD-related downstream tasks. Our code and pre-trained models are available at: https://github.com/Opening-ETDs/ETD-OD.</abstract>
<identifier type="citekey">ahuja-etal-2022-parsing</identifier>
<location>
<url>https://aclanthology.org/2022.wiesp-1.14</url>
</location>
<part>
<date>2022-11</date>
<extent unit="page">
<start>121</start>
<end>130</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Parsing Electronic Theses and Dissertations Using Object Detection
%A Ahuja, Aman
%A Devera, Alan
%A Fox, Edward Alan
%S Proceedings of the first Workshop on Information Extraction from Scientific Publications
%D 2022
%8 November
%I Association for Computational Linguistics
%C Online
%F ahuja-etal-2022-parsing
%X Electronic theses and dissertations (ETDs) contain valuable knowledge that can be useful for a wide range of purposes. To effectively utilize the knowledge contained in ETDs for downstream tasks such as search and retrieval, question-answering, and summarization, the data first needs to be parsed and stored in a format such as XML. However, since most of the ETDs available on the web are PDF documents, parsing them to make their data useful for downstream tasks is a challenge. In this work, we propose a dataset and a framework to help with parsing long scholarly documents such as ETDs. We take the Object Detection approach for document parsing. We first introduce a set of objects that are important elements of an ETD, along with a new dataset ETD-OD that consists of over 25K page images originating from 200 ETDs with bounding boxes around each of the objects. We also propose a framework that utilizes this dataset for converting ETDs to XML, which can further be used for ETD-related downstream tasks. Our code and pre-trained models are available at: https://github.com/Opening-ETDs/ETD-OD.
%U https://aclanthology.org/2022.wiesp-1.14
%P 121-130
Markdown (Informal)
[Parsing Electronic Theses and Dissertations Using Object Detection](https://aclanthology.org/2022.wiesp-1.14) (Ahuja et al., WIESP 2022)
ACL