% Conference paper record exported from the ACL Anthology (see the url field).
% Case-sensitive words in the title are brace-protected as whole words so that
% sentence-casing bibliography styles leave them intact.
@inproceedings{manrique-gomez-etal-2024-historical,
  title     = {Historical Ink: 19th Century {Latin} {American} {Spanish} Newspaper Corpus with {LLM} {OCR} Correction},
  author    = {Manrique-Gomez, Laura and
               Montes, Tony and
               Rodriguez Herrera, Arturo and
               Manrique, Ruben},
  editor    = {H{\"a}m{\"a}l{\"a}inen, Mika and
               {\"O}hman, Emily and
               Miyagawa, So and
               Alnajjar, Khalid and
               Bizzoni, Yuri},
  booktitle = {Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities},
  month     = nov,
  year      = {2024},
  address   = {Miami, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.nlp4dh-1.13},
  pages     = {132--139},
  abstract  = {This paper presents two significant contributions: First, it introduces a novel dataset of 19th-century Latin American newspaper texts, addressing a critical gap in specialized corpora for historical and linguistic analysis in this region. Second, it develops a flexible framework that utilizes a Large Language Model for OCR error correction and linguistic surface form detection in digitized corpora. This semi-automated framework is adaptable to various contexts and datasets and is applied to the newly created dataset.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record describing one conference paper. -->
  <mods ID="manrique-gomez-etal-2024-historical">
    <titleInfo>
      <title>Historical Ink: 19th Century Latin American Spanish Newspaper Corpus with LLM OCR Correction</title>
    </titleInfo>
    <!-- Paper authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Laura</namePart>
      <namePart type="family">Manrique-Gomez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tony</namePart>
      <namePart type="family">Montes</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Arturo</namePart>
      <namePart type="family">Rodriguez Herrera</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ruben</namePart>
      <namePart type="family">Manrique</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher details. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mika</namePart>
        <namePart type="family">Hämäläinen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Emily</namePart>
        <namePart type="family">Öhman</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">So</namePart>
        <namePart type="family">Miyagawa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Khalid</namePart>
        <namePart type="family">Alnajjar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yuri</namePart>
        <namePart type="family">Bizzoni</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This paper presents two significant contributions: First, it introduces a novel dataset of 19th-century Latin American newspaper texts, addressing a critical gap in specialized corpora for historical and linguistic analysis in this region. Second, it develops a flexible framework that utilizes a Large Language Model for OCR error correction and linguistic surface form detection in digitized corpora. This semi-automated framework is adaptable to various contexts and datasets and is applied to the newly created dataset.</abstract>
    <identifier type="citekey">manrique-gomez-etal-2024-historical</identifier>
    <location>
      <url>https://aclanthology.org/2024.nlp4dh-1.13</url>
    </location>
    <!-- Issue date and page extent of the paper. -->
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>132</start>
        <end>139</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Historical Ink: 19th Century Latin American Spanish Newspaper Corpus with LLM OCR Correction
%A Manrique-Gomez, Laura
%A Montes, Tony
%A Rodriguez Herrera, Arturo
%A Manrique, Ruben
%Y Hämäläinen, Mika
%Y Öhman, Emily
%Y Miyagawa, So
%Y Alnajjar, Khalid
%Y Bizzoni, Yuri
%S Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, USA
%F manrique-gomez-etal-2024-historical
%X This paper presents two significant contributions: First, it introduces a novel dataset of 19th-century Latin American newspaper texts, addressing a critical gap in specialized corpora for historical and linguistic analysis in this region. Second, it develops a flexible framework that utilizes a Large Language Model for OCR error correction and linguistic surface form detection in digitized corpora. This semi-automated framework is adaptable to various contexts and datasets and is applied to the newly created dataset.
%U https://aclanthology.org/2024.nlp4dh-1.13
%P 132-139
Markdown (Informal)
[Historical Ink: 19th Century Latin American Spanish Newspaper Corpus with LLM OCR Correction](https://aclanthology.org/2024.nlp4dh-1.13) (Manrique-Gomez et al., NLP4DH 2024)
ACL