@inproceedings{schoffel-etal-2025-modern,
title = "Modern Models, Medieval Texts: A {POS} Tagging Study of Old {O}ccitan",
author = {Sch{\"o}ffel, Matthias and
Wiedner, Marinus and
Garces Arias, Esteban and
Ruppert, Paula and
Heumann, Christian and
A{\ss}enmacher, Matthias},
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
{\"O}hman, Emily and
Bizzoni, Yuri and
Miyagawa, So and
Alnajjar, Khalid},
booktitle = "Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities",
month = may,
year = "2025",
address = "Albuquerque, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.nlp4dh-1.30/",
doi = "10.18653/v1/2025.nlp4dh-1.30",
pages = "334--349",
ISBN = "979-8-89176-234-3",
abstract = "Large language models (LLMs) have demonstrated remarkable capabilities in natural language processing, yet their effectiveness in handling historical languages remains largely unexplored. This study examines the performance of open-source LLMs in part-of-speech (POS) tagging for Old Occitan, a historical language characterized by non-standardized orthography and significant diachronic variation. Through comparative analysis of two distinct corpora{---}hagiographical and medical texts{---}we evaluate how current models handle the inherent challenges of processing a low-resource historical language. Our findings demonstrate critical limitations in LLM performance when confronted with extreme orthographic and syntactic variability. We provide detailed error analysis and specific recommendations for improving model performance in historical language processing. This research advances our understanding of LLM capabilities in challenging linguistic contexts while offering practical insights for both computational linguistics and historical language studies."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="schoffel-etal-2025-modern">
<titleInfo>
<title>Modern Models, Medieval Texts: A POS Tagging Study of Old Occitan</title>
</titleInfo>
<name type="personal">
<namePart type="given">Matthias</namePart>
<namePart type="family">Schöffel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marinus</namePart>
<namePart type="family">Wiedner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Esteban</namePart>
<namePart type="family">Garces Arias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paula</namePart>
<namePart type="family">Ruppert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Heumann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthias</namePart>
<namePart type="family">Aßenmacher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="family">Öhman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuri</namePart>
<namePart type="family">Bizzoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">So</namePart>
<namePart type="family">Miyagawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Alnajjar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-234-3</identifier>
</relatedItem>
<abstract>Large language models (LLMs) have demonstrated remarkable capabilities in natural language processing, yet their effectiveness in handling historical languages remains largely unexplored. This study examines the performance of open-source LLMs in part-of-speech (POS) tagging for Old Occitan, a historical language characterized by non-standardized orthography and significant diachronic variation. Through comparative analysis of two distinct corpora—hagiographical and medical texts—we evaluate how current models handle the inherent challenges of processing a low-resource historical language. Our findings demonstrate critical limitations in LLM performance when confronted with extreme orthographic and syntactic variability. We provide detailed error analysis and specific recommendations for improving model performance in historical language processing. This research advances our understanding of LLM capabilities in challenging linguistic contexts while offering practical insights for both computational linguistics and historical language studies.</abstract>
<identifier type="citekey">schoffel-etal-2025-modern</identifier>
<identifier type="doi">10.18653/v1/2025.nlp4dh-1.30</identifier>
<location>
<url>https://aclanthology.org/2025.nlp4dh-1.30/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>334</start>
<end>349</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Modern Models, Medieval Texts: A POS Tagging Study of Old Occitan
%A Schöffel, Matthias
%A Wiedner, Marinus
%A Garces Arias, Esteban
%A Ruppert, Paula
%A Heumann, Christian
%A Aßenmacher, Matthias
%Y Hämäläinen, Mika
%Y Öhman, Emily
%Y Bizzoni, Yuri
%Y Miyagawa, So
%Y Alnajjar, Khalid
%S Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, USA
%@ 979-8-89176-234-3
%F schoffel-etal-2025-modern
%X Large language models (LLMs) have demonstrated remarkable capabilities in natural language processing, yet their effectiveness in handling historical languages remains largely unexplored. This study examines the performance of open-source LLMs in part-of-speech (POS) tagging for Old Occitan, a historical language characterized by non-standardized orthography and significant diachronic variation. Through comparative analysis of two distinct corpora—hagiographical and medical texts—we evaluate how current models handle the inherent challenges of processing a low-resource historical language. Our findings demonstrate critical limitations in LLM performance when confronted with extreme orthographic and syntactic variability. We provide detailed error analysis and specific recommendations for improving model performance in historical language processing. This research advances our understanding of LLM capabilities in challenging linguistic contexts while offering practical insights for both computational linguistics and historical language studies.
%R 10.18653/v1/2025.nlp4dh-1.30
%U https://aclanthology.org/2025.nlp4dh-1.30/
%U https://doi.org/10.18653/v1/2025.nlp4dh-1.30
%P 334-349
Markdown (Informal)
[Modern Models, Medieval Texts: A POS Tagging Study of Old Occitan](https://aclanthology.org/2025.nlp4dh-1.30/) (Schöffel et al., NLP4DH 2025)
ACL
- Matthias Schöffel, Marinus Wiedner, Esteban Garces Arias, Paula Ruppert, Christian Heumann, and Matthias Aßenmacher. 2025. Modern Models, Medieval Texts: A POS Tagging Study of Old Occitan. In Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities, pages 334–349, Albuquerque, USA. Association for Computational Linguistics.