@inproceedings{hervieux-etal-2024-language,
title = "Language Resources From Prominent Born-Digital Humanities Texts are Still Needed in the Age of {LLM}s",
author = "Hervieux, Natalie and
Yao, Peiran and
Brown, Susan and
Barbosa, Denilson",
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
{\"O}hman, Emily and
Miyagawa, So and
Alnajjar, Khalid and
Bizzoni, Yuri},
booktitle = "Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities",
month = nov,
year = "2024",
address = "Miami, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.nlp4dh-1.9",
pages = "85--104",
abstract = "The digital humanities (DH) community fundamentally embraces the use of computerized tools for the study and creation of knowledge related to language, history, culture, and human values, in which natural language plays a prominent role. Many successful DH tools rely heavily on Natural Language Processing methods, and several efforts exist within the DH community to promote the use of newer and better tools. Nevertheless, most NLP research is driven by web corpora that are noticeably different from texts commonly found in DH artifacts, which tend to use richer language and refer to rarer entities. Thus, the near-human performance achieved by state-of-the-art NLP tools on web texts might not be achievable on DH texts. We introduce a dataset carefully created by computer scientists and digital humanists intended to serve as a reference point for the development and evaluation of NLP tools. The dataset is a subset of a born-digital textbase resulting from a prominent and ongoing experiment in digital literary history, containing thousands of multi-sentence excerpts that are suited for information extraction tasks. We fully describe the dataset and show that its language is demonstrably different than the corpora normally used in training language resources in the NLP community.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hervieux-etal-2024-language">
<titleInfo>
<title>Language Resources From Prominent Born-Digital Humanities Texts are Still Needed in the Age of LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Natalie</namePart>
<namePart type="family">Hervieux</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peiran</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Susan</namePart>
<namePart type="family">Brown</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Denilson</namePart>
<namePart type="family">Barbosa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="family">Öhman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">So</namePart>
<namePart type="family">Miyagawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Alnajjar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuri</namePart>
<namePart type="family">Bizzoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The digital humanities (DH) community fundamentally embraces the use of computerized tools for the study and creation of knowledge related to language, history, culture, and human values, in which natural language plays a prominent role. Many successful DH tools rely heavily on Natural Language Processing methods, and several efforts exist within the DH community to promote the use of newer and better tools. Nevertheless, most NLP research is driven by web corpora that are noticeably different from texts commonly found in DH artifacts, which tend to use richer language and refer to rarer entities. Thus, the near-human performance achieved by state-of-the-art NLP tools on web texts might not be achievable on DH texts. We introduce a dataset carefully created by computer scientists and digital humanists intended to serve as a reference point for the development and evaluation of NLP tools. The dataset is a subset of a born-digital textbase resulting from a prominent and ongoing experiment in digital literary history, containing thousands of multi-sentence excerpts that are suited for information extraction tasks. We fully describe the dataset and show that its language is demonstrably different than the corpora normally used in training language resources in the NLP community.</abstract>
<identifier type="citekey">hervieux-etal-2024-language</identifier>
<location>
<url>https://aclanthology.org/2024.nlp4dh-1.9</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>85</start>
<end>104</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language Resources From Prominent Born-Digital Humanities Texts are Still Needed in the Age of LLMs
%A Hervieux, Natalie
%A Yao, Peiran
%A Brown, Susan
%A Barbosa, Denilson
%Y Hämäläinen, Mika
%Y Öhman, Emily
%Y Miyagawa, So
%Y Alnajjar, Khalid
%Y Bizzoni, Yuri
%S Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, USA
%F hervieux-etal-2024-language
%X The digital humanities (DH) community fundamentally embraces the use of computerized tools for the study and creation of knowledge related to language, history, culture, and human values, in which natural language plays a prominent role. Many successful DH tools rely heavily on Natural Language Processing methods, and several efforts exist within the DH community to promote the use of newer and better tools. Nevertheless, most NLP research is driven by web corpora that are noticeably different from texts commonly found in DH artifacts, which tend to use richer language and refer to rarer entities. Thus, the near-human performance achieved by state-of-the-art NLP tools on web texts might not be achievable on DH texts. We introduce a dataset carefully created by computer scientists and digital humanists intended to serve as a reference point for the development and evaluation of NLP tools. The dataset is a subset of a born-digital textbase resulting from a prominent and ongoing experiment in digital literary history, containing thousands of multi-sentence excerpts that are suited for information extraction tasks. We fully describe the dataset and show that its language is demonstrably different than the corpora normally used in training language resources in the NLP community.
%U https://aclanthology.org/2024.nlp4dh-1.9
%P 85-104
Markdown (Informal)
[Language Resources From Prominent Born-Digital Humanities Texts are Still Needed in the Age of LLMs](https://aclanthology.org/2024.nlp4dh-1.9) (Hervieux et al., NLP4DH 2024)
ACL