@inproceedings{martin-etal-2022-nlpsharedtasks,
title = "{NLPS}hared{T}asks: A Corpus of Shared Task Overview Papers in Natural Language Processing Domains",
author = "Martin, Anna and
Pedersen, Ted and
D{'}Souza, Jennifer",
editor = "Ghosal, Tirthankar and
Blanco-Cuaresma, Sergi and
Accomazzi, Alberto and
Patton, Robert M. and
Grezes, Felix and
Allen, Thomas",
booktitle = "Proceedings of the first Workshop on Information Extraction from Scientific Publications",
month = nov,
year = "2022",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.wiesp-1.13",
doi = "10.18653/v1/2022.wiesp-1.13",
pages = "105--120",
abstract = "As the rate of scientific output continues to grow, it is increasingly important to develop systems to improve interfaces between researchers and scholarly papers. Training models to extract scientific information from the full texts of scholarly documents is important for improving how we structure and access scientific information. However, there are few annotated corpora that provide full paper texts. This paper presents the NLPSharedTasks corpus, a new resource of 254 full text Shared Task Overview papers in NLP domains with annotated task descriptions. We calculated strict and relaxed inter-annotator agreement scores, achieving Cohen{'}s kappa coefficients of 0.44 and 0.95, respectively. Lastly, we performed a sentence classification task over the dataset, in order to generate a neural baseline for future research and to provide an example of how to preprocess unbalanced datasets of full scientific texts. We achieved an F1 score of 0.75 using SciBERT, fine-tuned and tested on a rebalanced version of the dataset.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="martin-etal-2022-nlpsharedtasks">
<titleInfo>
<title>NLPSharedTasks: A Corpus of Shared Task Overview Papers in Natural Language Processing Domains</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Martin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ted</namePart>
<namePart type="family">Pedersen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jennifer</namePart>
<namePart type="family">D’Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the first Workshop on Information Extraction from Scientific Publications</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tirthankar</namePart>
<namePart type="family">Ghosal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergi</namePart>
<namePart type="family">Blanco-Cuaresma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alberto</namePart>
<namePart type="family">Accomazzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Patton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felix</namePart>
<namePart type="family">Grezes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Allen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>As the rate of scientific output continues to grow, it is increasingly important to develop systems to improve interfaces between researchers and scholarly papers. Training models to extract scientific information from the full texts of scholarly documents is important for improving how we structure and access scientific information. However, there are few annotated corpora that provide full paper texts. This paper presents the NLPSharedTasks corpus, a new resource of 254 full text Shared Task Overview papers in NLP domains with annotated task descriptions. We calculated strict and relaxed inter-annotator agreement scores, achieving Cohen’s kappa coefficients of 0.44 and 0.95, respectively. Lastly, we performed a sentence classification task over the dataset, in order to generate a neural baseline for future research and to provide an example of how to preprocess unbalanced datasets of full scientific texts. We achieved an F1 score of 0.75 using SciBERT, fine-tuned and tested on a rebalanced version of the dataset.</abstract>
<identifier type="citekey">martin-etal-2022-nlpsharedtasks</identifier>
<identifier type="doi">10.18653/v1/2022.wiesp-1.13</identifier>
<location>
<url>https://aclanthology.org/2022.wiesp-1.13</url>
</location>
<part>
<date>2022-11</date>
<extent unit="page">
<start>105</start>
<end>120</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T NLPSharedTasks: A Corpus of Shared Task Overview Papers in Natural Language Processing Domains
%A Martin, Anna
%A Pedersen, Ted
%A D’Souza, Jennifer
%Y Ghosal, Tirthankar
%Y Blanco-Cuaresma, Sergi
%Y Accomazzi, Alberto
%Y Patton, Robert M.
%Y Grezes, Felix
%Y Allen, Thomas
%S Proceedings of the first Workshop on Information Extraction from Scientific Publications
%D 2022
%8 November
%I Association for Computational Linguistics
%C Online
%F martin-etal-2022-nlpsharedtasks
%X As the rate of scientific output continues to grow, it is increasingly important to develop systems to improve interfaces between researchers and scholarly papers. Training models to extract scientific information from the full texts of scholarly documents is important for improving how we structure and access scientific information. However, there are few annotated corpora that provide full paper texts. This paper presents the NLPSharedTasks corpus, a new resource of 254 full text Shared Task Overview papers in NLP domains with annotated task descriptions. We calculated strict and relaxed inter-annotator agreement scores, achieving Cohen’s kappa coefficients of 0.44 and 0.95, respectively. Lastly, we performed a sentence classification task over the dataset, in order to generate a neural baseline for future research and to provide an example of how to preprocess unbalanced datasets of full scientific texts. We achieved an F1 score of 0.75 using SciBERT, fine-tuned and tested on a rebalanced version of the dataset.
%R 10.18653/v1/2022.wiesp-1.13
%U https://aclanthology.org/2022.wiesp-1.13
%U https://doi.org/10.18653/v1/2022.wiesp-1.13
%P 105-120
Markdown (Informal)
[NLPSharedTasks: A Corpus of Shared Task Overview Papers in Natural Language Processing Domains](https://aclanthology.org/2022.wiesp-1.13) (Martin et al., WIESP 2022)
ACL