@inproceedings{rubino-etal-2024-normalizing,
title = "Normalizing without Modernizing: Keeping Historical Wordforms of {M}iddle {F}rench while Reducing Spelling Variants",
author = "Rubino, Raphael and
Gerlach, Johanna and
Mutal, Jonathan and
Bouillon, Pierrette",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.215",
doi = "10.18653/v1/2024.findings-naacl.215",
pages = "3394--3402",
abstract = "Conservation of historical documents benefits from computational methods by alleviating the manual labor related to digitization and modernization of textual content. Languages usually evolve over time and keeping historical wordforms is crucial for diachronic studies and digital humanities. However, spelling conventions did not necessarily exist when texts were originally written and orthographic variations are commonly observed depending on scribes and time periods. In this study, we propose to automatically normalize orthographic wordforms found in historical archives written in Middle French during the 16th century without fully modernizing textual content. We leverage pre-trained models in a low resource setting based on a manually curated parallel corpus and produce additional resources with artificial data generation approaches. Results show that causal language models and knowledge distillation improve over a strong baseline, thus validating the proposed methods.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rubino-etal-2024-normalizing">
<titleInfo>
<title>Normalizing without Modernizing: Keeping Historical Wordforms of Middle French while Reducing Spelling Variants</title>
</titleInfo>
<name type="personal">
<namePart type="given">Raphael</namePart>
<namePart type="family">Rubino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johanna</namePart>
<namePart type="family">Gerlach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="family">Mutal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierrette</namePart>
<namePart type="family">Bouillon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Conservation of historical documents benefits from computational methods by alleviating the manual labor related to digitization and modernization of textual content. Languages usually evolve over time and keeping historical wordforms is crucial for diachronic studies and digital humanities. However, spelling conventions did not necessarily exist when texts were originally written and orthographic variations are commonly observed depending on scribes and time periods. In this study, we propose to automatically normalize orthographic wordforms found in historical archives written in Middle French during the 16th century without fully modernizing textual content. We leverage pre-trained models in a low resource setting based on a manually curated parallel corpus and produce additional resources with artificial data generation approaches. Results show that causal language models and knowledge distillation improve over a strong baseline, thus validating the proposed methods.</abstract>
<identifier type="citekey">rubino-etal-2024-normalizing</identifier>
<identifier type="doi">10.18653/v1/2024.findings-naacl.215</identifier>
<location>
<url>https://aclanthology.org/2024.findings-naacl.215</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>3394</start>
<end>3402</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Normalizing without Modernizing: Keeping Historical Wordforms of Middle French while Reducing Spelling Variants
%A Rubino, Raphael
%A Gerlach, Johanna
%A Mutal, Jonathan
%A Bouillon, Pierrette
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F rubino-etal-2024-normalizing
%X Conservation of historical documents benefits from computational methods by alleviating the manual labor related to digitization and modernization of textual content. Languages usually evolve over time and keeping historical wordforms is crucial for diachronic studies and digital humanities. However, spelling conventions did not necessarily exist when texts were originally written and orthographic variations are commonly observed depending on scribes and time periods. In this study, we propose to automatically normalize orthographic wordforms found in historical archives written in Middle French during the 16th century without fully modernizing textual content. We leverage pre-trained models in a low resource setting based on a manually curated parallel corpus and produce additional resources with artificial data generation approaches. Results show that causal language models and knowledge distillation improve over a strong baseline, thus validating the proposed methods.
%R 10.18653/v1/2024.findings-naacl.215
%U https://aclanthology.org/2024.findings-naacl.215
%U https://doi.org/10.18653/v1/2024.findings-naacl.215
%P 3394-3402
Markdown (Informal)
[Normalizing without Modernizing: Keeping Historical Wordforms of Middle French while Reducing Spelling Variants](https://aclanthology.org/2024.findings-naacl.215) (Rubino et al., Findings 2024)
ACL