@inproceedings{lendvai-etal-2025-instruction,
title = "Instruction Finetuning to Attribute Language Stage, Dialect, and Provenance Region to Historical {C}hurch {S}lavic Texts",
author = "Lendvai, Piroska and
Reichel, Uwe and
Jouravel, Anna and
Rabus, Achim and
Renje, Elena",
editor = "Angelova, Galia and
Kunilovskaya, Maria and
Escribe, Marie and
Mitkov, Ruslan",
booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ranlp-1.76/",
pages = "654--662",
abstract = "Our study addresses domain-specific text provenance classification for the historical Church Slavic language. The downstream task is to attribute the language stage and its dialectal and regional varieties to texts compiled from newly curated sources, including digitally unpublished manuscripts, in addition to established Church Slavic resources from the Universal Dependencies Treebank. We aim to harmonize previously used tag sets pertaining to textual provenance, and construct a new, hierarchical, multi-layer provenance labeling scheme. For the classification task, we finetune Vikhr (Nikolich et al., 2004), a generative LLM with knowledge of modern Russian, with the instruction to generate labels to classify the provenance of sentence-level text units. Besides gold standard manuscript transcriptions, we test the finetuned model on character-corrupted data that emulate the quality of noisy, handwritten text recognition material. The experiments show that the Vikhr base model has low provenance attribution knowledge of Church Slavic, whereas our finetuned model achieves above .9 F-scores on Language stage labeling and Dialect labeling, and above .8 F-score on generating the label that jointly classifies all three provenance layers. The task of classifying the fine-grained geographical region from which a manuscript originates proves harder (but still performs above .8), and is negatively impacted by character level noise injection."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lendvai-etal-2025-instruction">
<titleInfo>
<title>Instruction Finetuning to Attribute Language Stage, Dialect, and Provenance Region to Historical Church Slavic Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Piroska</namePart>
<namePart type="family">Lendvai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Uwe</namePart>
<namePart type="family">Reichel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Jouravel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Achim</namePart>
<namePart type="family">Rabus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Renje</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Our study addresses domain-specific text provenance classification for the historical Church Slavic language. The downstream task is to attribute the language stage and its dialectal and regional varieties to texts compiled from newly curated sources, including digitally unpublished manuscripts, in addition to established Church Slavic resources from the Universal Dependencies Treebank. We aim to harmonize previously used tag sets pertaining to textual provenance, and construct a new, hierarchical, multi-layer provenance labeling scheme. For the classification task, we finetune Vikhr (Nikolich et al., 2004), a generative LLM with knowledge of modern Russian, with the instruction to generate labels to classify the provenance of sentence-level text units. Besides gold standard manuscript transcriptions, we test the finetuned model on character-corrupted data that emulate the quality of noisy, handwritten text recognition material. The experiments show that the Vikhr base model has low provenance attribution knowledge of Church Slavic, whereas our finetuned model achieves above .9 F-scores on Language stage labeling and Dialect labeling, and above .8 F-score on generating the label that jointly classifies all three provenance layers. The task of classifying the fine-grained geographical region from which a manuscript originates proves harder (but still performs above .8), and is negatively impacted by character level noise injection.</abstract>
<identifier type="citekey">lendvai-etal-2025-instruction</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.76/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>654</start>
<end>662</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Instruction Finetuning to Attribute Language Stage, Dialect, and Provenance Region to Historical Church Slavic Texts
%A Lendvai, Piroska
%A Reichel, Uwe
%A Jouravel, Anna
%A Rabus, Achim
%A Renje, Elena
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F lendvai-etal-2025-instruction
%X Our study addresses domain-specific text provenance classification for the historical Church Slavic language. The downstream task is to attribute the language stage and its dialectal and regional varieties to texts compiled from newly curated sources, including digitally unpublished manuscripts, in addition to established Church Slavic resources from the Universal Dependencies Treebank. We aim to harmonize previously used tag sets pertaining to textual provenance, and construct a new, hierarchical, multi-layer provenance labeling scheme. For the classification task, we finetune Vikhr (Nikolich et al., 2004), a generative LLM with knowledge of modern Russian, with the instruction to generate labels to classify the provenance of sentence-level text units. Besides gold standard manuscript transcriptions, we test the finetuned model on character-corrupted data that emulate the quality of noisy, handwritten text recognition material. The experiments show that the Vikhr base model has low provenance attribution knowledge of Church Slavic, whereas our finetuned model achieves above .9 F-scores on Language stage labeling and Dialect labeling, and above .8 F-score on generating the label that jointly classifies all three provenance layers. The task of classifying the fine-grained geographical region from which a manuscript originates proves harder (but still performs above .8), and is negatively impacted by character level noise injection.
%U https://aclanthology.org/2025.ranlp-1.76/
%P 654-662
Markdown (Informal)
[Instruction Finetuning to Attribute Language Stage, Dialect, and Provenance Region to Historical Church Slavic Texts](https://aclanthology.org/2025.ranlp-1.76/) (Lendvai et al., RANLP 2025)
ACL
- Piroska Lendvai, Uwe Reichel, Anna Jouravel, Achim Rabus, and Elena Renje. 2025. Instruction Finetuning to Attribute Language Stage, Dialect, and Provenance Region to Historical Church Slavic Texts. In Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era, pages 654–662, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.