@inproceedings{lendvai-etal-2023-domain-adapting,
title = "Domain-Adapting {BERT} for Attributing Manuscript, Century and Region in Pre-{M}odern {S}lavic Texts",
author = "Lendvai, Piroska and
Reichel, Uwe and
Jouravel, Anna and
Rabus, Achim and
Renje, Elena",
editor = "Tahmasebi, Nina and
Montariol, Syrielle and
Dubossarsky, Haim and
Kutuzov, Andrey and
Hengchen, Simon and
Alfter, David and
Periti, Francesco and
Cassotti, Pierluigi",
booktitle = "Proceedings of the 4th Workshop on Computational Approaches to Historical Language Change",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.lchange-1.2/",
doi = "10.18653/v1/2023.lchange-1.2",
pages = "15--21",
abstract = "Our study presents a stratified dataset compiled from six different Slavic bodies of text, for cross-linguistic and diachronic analyses of Slavic Pre-Modern language variants. We demonstrate unsupervised domain adaptation and supervised finetuning of BERT on these low-resource, historical Slavic variants, for the purposes of provenance attribution in terms of three downstream tasks: manuscript, century and copying region classification. The data compilation aims to capture diachronic as well as regional language variation and change: the texts were written in the course of roughly a millennium, incorporating language variants from the High Middle Ages to the Early Modern Period, and originate from a variety of geographic regions. Mechanisms of language change in relatively small portions of such data have been inspected, analyzed and typologized by Slavists manually; our contribution aims to investigate the extent to which the BERT transformer architecture and pretrained models can benefit this process. Using these datasets for domain adaptation, we could attribute temporal, geographical and manuscript origin on the level of text snippets with high F-scores. We also conducted a qualitative analysis of the models' misclassifications."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lendvai-etal-2023-domain-adapting">
<titleInfo>
<title>Domain-Adapting BERT for Attributing Manuscript, Century and Region in Pre-Modern Slavic Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Piroska</namePart>
<namePart type="family">Lendvai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Uwe</namePart>
<namePart type="family">Reichel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Jouravel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Achim</namePart>
<namePart type="family">Rabus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Renje</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Computational Approaches to Historical Language Change</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nina</namePart>
<namePart type="family">Tahmasebi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Syrielle</namePart>
<namePart type="family">Montariol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haim</namePart>
<namePart type="family">Dubossarsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrey</namePart>
<namePart type="family">Kutuzov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Hengchen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Alfter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francesco</namePart>
<namePart type="family">Periti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierluigi</namePart>
<namePart type="family">Cassotti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Our study presents a stratified dataset compiled from six different Slavic bodies of text, for cross-linguistic and diachronic analyses of Slavic Pre-Modern language variants. We demonstrate unsupervised domain adaptation and supervised finetuning of BERT on these low-resource, historical Slavic variants, for the purposes of provenance attribution in terms of three downstream tasks: manuscript, century and copying region classification. The data compilation aims to capture diachronic as well as regional language variation and change: the texts were written in the course of roughly a millennium, incorporating language variants from the High Middle Ages to the Early Modern Period, and originate from a variety of geographic regions. Mechanisms of language change in relatively small portions of such data have been inspected, analyzed and typologized by Slavists manually; our contribution aims to investigate the extent to which the BERT transformer architecture and pretrained models can benefit this process. Using these datasets for domain adaptation, we could attribute temporal, geographical and manuscript origin on the level of text snippets with high F-scores. We also conducted a qualitative analysis of the models’ misclassifications.</abstract>
<identifier type="citekey">lendvai-etal-2023-domain-adapting</identifier>
<identifier type="doi">10.18653/v1/2023.lchange-1.2</identifier>
<location>
<url>https://aclanthology.org/2023.lchange-1.2/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>15</start>
<end>21</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Domain-Adapting BERT for Attributing Manuscript, Century and Region in Pre-Modern Slavic Texts
%A Lendvai, Piroska
%A Reichel, Uwe
%A Jouravel, Anna
%A Rabus, Achim
%A Renje, Elena
%Y Tahmasebi, Nina
%Y Montariol, Syrielle
%Y Dubossarsky, Haim
%Y Kutuzov, Andrey
%Y Hengchen, Simon
%Y Alfter, David
%Y Periti, Francesco
%Y Cassotti, Pierluigi
%S Proceedings of the 4th Workshop on Computational Approaches to Historical Language Change
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F lendvai-etal-2023-domain-adapting
%X Our study presents a stratified dataset compiled from six different Slavic bodies of text, for cross-linguistic and diachronic analyses of Slavic Pre-Modern language variants. We demonstrate unsupervised domain adaptation and supervised finetuning of BERT on these low-resource, historical Slavic variants, for the purposes of provenance attribution in terms of three downstream tasks: manuscript, century and copying region classification. The data compilation aims to capture diachronic as well as regional language variation and change: the texts were written in the course of roughly a millennium, incorporating language variants from the High Middle Ages to the Early Modern Period, and originate from a variety of geographic regions. Mechanisms of language change in relatively small portions of such data have been inspected, analyzed and typologized by Slavists manually; our contribution aims to investigate the extent to which the BERT transformer architecture and pretrained models can benefit this process. Using these datasets for domain adaptation, we could attribute temporal, geographical and manuscript origin on the level of text snippets with high F-scores. We also conducted a qualitative analysis of the models’ misclassifications.
%R 10.18653/v1/2023.lchange-1.2
%U https://aclanthology.org/2023.lchange-1.2/
%U https://doi.org/10.18653/v1/2023.lchange-1.2
%P 15-21
Markdown (Informal)
[Domain-Adapting BERT for Attributing Manuscript, Century and Region in Pre-Modern Slavic Texts](https://aclanthology.org/2023.lchange-1.2/) (Lendvai et al., LChange 2023)
ACL