@inproceedings{pertsas-etal-2024-annotated,
title = "An Annotated Dataset for Transformer-based Scholarly Information Extraction and Linguistic Linked Data Generation",
author = "Pertsas, Vayianos and
Kasapaki, Marialena and
Constantopoulos, Panos",
editor = "Chiarcos, Christian and
Gkirtzou, Katerina and
Ionov, Maxim and
Khan, Fahad and
McCrae, John P. and
Ponsoda, Elena Montiel and
Chozas, Patricia Mart{\'\i}n",
booktitle = "Proceedings of the 9th Workshop on Linked Data in Linguistics @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.ldl-1.11",
pages = "84--93",
abstract = "We present a manually curated and annotated, multidisciplinary dataset of 15,262 sentences from research articles (abstract and main text) that can be used for transformer-based extraction from scholarly publications of three types of entities: 1) research methods, named entities of variable length, 2) research goals, entities that appear as textual spans of variable length with mostly fixed lexico-syntactic-structure, and 3) research activities, entities that appear as textual spans of variable length with complex lexico-syntactic structure. We explore the capabilities of our dataset by using it for training/fine-tuning various ML and transformer-based models. We compare our finetuned models as well as LLM responses (chatGPT 3.5) based on 10-shot learning, by measuring F1 scores in token-based, entity-based strict and entity-based partial evaluations across interdisciplinary and discipline-specific datasets in order to capture any possible differences in discipline-oriented writing styles. Results show that fine tuning of transformer-based models significantly outperforms the performance of few-shot learning of LLMs such as chatGPT, highlighting the significance of annotation datasets in such tasks. Our dataset can also be used as a source for linguistic linked data by itself. We demonstrate this by presenting indicative queries in SPARQL, executed over such an RDF knowledge graph.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pertsas-etal-2024-annotated">
<titleInfo>
<title>An Annotated Dataset for Transformer-based Scholarly Information Extraction and Linguistic Linked Data Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vayianos</namePart>
<namePart type="family">Pertsas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marialena</namePart>
<namePart type="family">Kasapaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Panos</namePart>
<namePart type="family">Constantopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 9th Workshop on Linked Data in Linguistics @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Chiarcos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katerina</namePart>
<namePart type="family">Gkirtzou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maxim</namePart>
<namePart type="family">Ionov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fahad</namePart>
<namePart type="family">Khan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="given">P</namePart>
<namePart type="family">McCrae</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="given">Montiel</namePart>
<namePart type="family">Ponsoda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patricia</namePart>
<namePart type="given">Martín</namePart>
<namePart type="family">Chozas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a manually curated and annotated, multidisciplinary dataset of 15,262 sentences from research articles (abstract and main text) that can be used for transformer-based extraction from scholarly publications of three types of entities: 1) research methods, named entities of variable length, 2) research goals, entities that appear as textual spans of variable length with mostly fixed lexico-syntactic-structure, and 3) research activities, entities that appear as textual spans of variable length with complex lexico-syntactic structure. We explore the capabilities of our dataset by using it for training/fine-tuning various ML and transformer-based models. We compare our finetuned models as well as LLM responses (chatGPT 3.5) based on 10-shot learning, by measuring F1 scores in token-based, entity-based strict and entity-based partial evaluations across interdisciplinary and discipline-specific datasets in order to capture any possible differences in discipline-oriented writing styles. Results show that fine tuning of transformer-based models significantly outperforms the performance of few-shot learning of LLMs such as chatGPT, highlighting the significance of annotation datasets in such tasks. Our dataset can also be used as a source for linguistic linked data by itself. We demonstrate this by presenting indicative queries in SPARQL, executed over such an RDF knowledge graph.</abstract>
<identifier type="citekey">pertsas-etal-2024-annotated</identifier>
<location>
<url>https://aclanthology.org/2024.ldl-1.11</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>84</start>
<end>93</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An Annotated Dataset for Transformer-based Scholarly Information Extraction and Linguistic Linked Data Generation
%A Pertsas, Vayianos
%A Kasapaki, Marialena
%A Constantopoulos, Panos
%Y Chiarcos, Christian
%Y Gkirtzou, Katerina
%Y Ionov, Maxim
%Y Khan, Fahad
%Y McCrae, John P.
%Y Ponsoda, Elena Montiel
%Y Chozas, Patricia Martín
%S Proceedings of the 9th Workshop on Linked Data in Linguistics @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F pertsas-etal-2024-annotated
%X We present a manually curated and annotated, multidisciplinary dataset of 15,262 sentences from research articles (abstract and main text) that can be used for transformer-based extraction from scholarly publications of three types of entities: 1) research methods, named entities of variable length, 2) research goals, entities that appear as textual spans of variable length with mostly fixed lexico-syntactic-structure, and 3) research activities, entities that appear as textual spans of variable length with complex lexico-syntactic structure. We explore the capabilities of our dataset by using it for training/fine-tuning various ML and transformer-based models. We compare our finetuned models as well as LLM responses (chatGPT 3.5) based on 10-shot learning, by measuring F1 scores in token-based, entity-based strict and entity-based partial evaluations across interdisciplinary and discipline-specific datasets in order to capture any possible differences in discipline-oriented writing styles. Results show that fine tuning of transformer-based models significantly outperforms the performance of few-shot learning of LLMs such as chatGPT, highlighting the significance of annotation datasets in such tasks. Our dataset can also be used as a source for linguistic linked data by itself. We demonstrate this by presenting indicative queries in SPARQL, executed over such an RDF knowledge graph.
%U https://aclanthology.org/2024.ldl-1.11
%P 84-93
Markdown (Informal)
[An Annotated Dataset for Transformer-based Scholarly Information Extraction and Linguistic Linked Data Generation](https://aclanthology.org/2024.ldl-1.11) (Pertsas et al., LDL-WS 2024)
ACL