@inproceedings{patel-etal-2020-use,
title = "On the Use of Web Search to Improve Scientific Collections",
author = "Patel, Krutarth and
Caragea, Cornelia and
Gollapalli, Sujatha Das",
editor = "Chandrasekaran, Muthu Kumar and
de Waard, Anita and
Feigenblat, Guy and
Freitag, Dayne and
Ghosal, Tirthankar and
Hovy, Eduard and
Knoth, Petr and
Konopnicki, David and
Mayr, Philipp and
Patton, Robert M. and
Shmueli-Scheuer, Michal",
booktitle = "Proceedings of the First Workshop on Scholarly Document Processing",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.sdp-1.20/",
doi = "10.18653/v1/2020.sdp-1.20",
pages = "174--183",
abstract = "Despite the advancements in search engine features, ranking methods, technologies, and the availability of programmable APIs, current-day open-access digital libraries still rely on crawl-based approaches for acquiring their underlying document collections. In this paper, we propose a novel search-driven framework for acquiring documents for such scientific portals. Within our framework, publicly-available research paper titles and author names are used as queries to a Web search engine. We were able to obtain {\textasciitilde}267,000 unique research papers through our fully-automated framework using {\textasciitilde}76,000 queries, resulting in almost 200,000 more papers than the number of queries. Moreover, through a combination of title and author name search, we were able to recover 78{\%} of the original searched titles."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="patel-etal-2020-use">
<titleInfo>
<title>On the Use of Web Search to Improve Scientific Collections</title>
</titleInfo>
<name type="personal">
<namePart type="given">Krutarth</namePart>
<namePart type="family">Patel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cornelia</namePart>
<namePart type="family">Caragea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujatha</namePart>
<namePart type="given">Das</namePart>
<namePart type="family">Gollapalli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Scholarly Document Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Muthu</namePart>
<namePart type="given">Kumar</namePart>
<namePart type="family">Chandrasekaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anita</namePart>
<namePart type="family">de Waard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guy</namePart>
<namePart type="family">Feigenblat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dayne</namePart>
<namePart type="family">Freitag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tirthankar</namePart>
<namePart type="family">Ghosal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eduard</namePart>
<namePart type="family">Hovy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Petr</namePart>
<namePart type="family">Knoth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Konopnicki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Mayr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Patton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Shmueli-Scheuer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Despite the advancements in search engine features, ranking methods, technologies, and the availability of programmable APIs, current-day open-access digital libraries still rely on crawl-based approaches for acquiring their underlying document collections. In this paper, we propose a novel search-driven framework for acquiring documents for such scientific portals. Within our framework, publicly-available research paper titles and author names are used as queries to a Web search engine. We were able to obtain ~267,000 unique research papers through our fully-automated framework using ~76,000 queries, resulting in almost 200,000 more papers than the number of queries. Moreover, through a combination of title and author name search, we were able to recover 78% of the original searched titles.</abstract>
<identifier type="citekey">patel-etal-2020-use</identifier>
<identifier type="doi">10.18653/v1/2020.sdp-1.20</identifier>
<location>
<url>https://aclanthology.org/2020.sdp-1.20/</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>174</start>
<end>183</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Use of Web Search to Improve Scientific Collections
%A Patel, Krutarth
%A Caragea, Cornelia
%A Gollapalli, Sujatha Das
%Y Chandrasekaran, Muthu Kumar
%Y de Waard, Anita
%Y Feigenblat, Guy
%Y Freitag, Dayne
%Y Ghosal, Tirthankar
%Y Hovy, Eduard
%Y Knoth, Petr
%Y Konopnicki, David
%Y Mayr, Philipp
%Y Patton, Robert M.
%Y Shmueli-Scheuer, Michal
%S Proceedings of the First Workshop on Scholarly Document Processing
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F patel-etal-2020-use
%X Despite the advancements in search engine features, ranking methods, technologies, and the availability of programmable APIs, current-day open-access digital libraries still rely on crawl-based approaches for acquiring their underlying document collections. In this paper, we propose a novel search-driven framework for acquiring documents for such scientific portals. Within our framework, publicly-available research paper titles and author names are used as queries to a Web search engine. We were able to obtain ~267,000 unique research papers through our fully-automated framework using ~76,000 queries, resulting in almost 200,000 more papers than the number of queries. Moreover, through a combination of title and author name search, we were able to recover 78% of the original searched titles.
%R 10.18653/v1/2020.sdp-1.20
%U https://aclanthology.org/2020.sdp-1.20/
%U https://doi.org/10.18653/v1/2020.sdp-1.20
%P 174-183
Markdown (Informal)
[On the Use of Web Search to Improve Scientific Collections](https://aclanthology.org/2020.sdp-1.20/) (Patel et al., sdp 2020)
ACL