@inproceedings{palmero-aprosio-etal-2022-bertoldo,
title = "{BERT}oldo, the Historical {BERT} for {I}talian",
author = "Palmero Aprosio, Alessio and
Menini, Stefano and
Tonelli, Sara",
editor = "Sprugnoli, Rachele and
Passarotti, Marco",
booktitle = "Proceedings of the Second Workshop on Language Technologies for Historical and Ancient Languages",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lt4hala-1.10",
pages = "68--72",
abstract = "Recent works in historical language processing have shown that transformer-based models can be successfully created using historical corpora, and that using them for analysing and classifying data from the past can be beneficial compared to standard transformer models. This has led to the creation of BERT-like models for different languages trained with digital repositories from the past. In this work we introduce the Italian version of historical BERT, which we call BERToldo. We evaluate the model on the task of PoS-tagging Dante Alighieri{'}s works, considering not only the tagger performance but also the model size and the time needed to train it. We also address the problem of duplicated data, which is rather common for languages with a limited availability of historical corpora. We show that deduplication reduces training time without affecting performance. The model and its smaller versions are all made available to the research community.",
}
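
The abstract above notes that BERToldo and its smaller variants are released to the research community. As an illustration only (the paper itself ships no such snippet, and `"path/to/bertoldo"` below is a placeholder rather than the published checkpoint name), a BERT-style checkpoint like this can be loaded with Hugging Face Transformers and probed via masked-token prediction on early Italian text:

```python
# Minimal sketch, not from the paper: load a BERT-style checkpoint and run
# masked-token prediction. "path/to/bertoldo" is a placeholder; substitute the
# hub identifier or local directory of the released BERToldo checkpoint.
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline

model_id = "path/to/bertoldo"  # placeholder, not the actual published model name
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)

fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
# Opening line of Dante's Commedia with one token masked out.
for pred in fill_mask("Nel mezzo del cammin di nostra [MASK]"):
    print(pred["token_str"], round(pred["score"], 3))
```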
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="palmero-aprosio-etal-2022-bertoldo">
    <titleInfo>
      <title>BERToldo, the Historical BERT for Italian</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Alessio</namePart>
      <namePart type="family">Palmero Aprosio</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Stefano</namePart>
      <namePart type="family">Menini</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sara</namePart>
      <namePart type="family">Tonelli</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Second Workshop on Language Technologies for Historical and Ancient Languages</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Rachele</namePart>
        <namePart type="family">Sprugnoli</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marco</namePart>
        <namePart type="family">Passarotti</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>European Language Resources Association</publisher>
        <place>
          <placeTerm type="text">Marseille, France</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Recent works in historical language processing have shown that transformer-based models can be successfully created using historical corpora, and that using them for analysing and classifying data from the past can be beneficial compared to standard transformer models. This has led to the creation of BERT-like models for different languages trained with digital repositories from the past. In this work we introduce the Italian version of historical BERT, which we call BERToldo. We evaluate the model on the task of PoS-tagging Dante Alighieri’s works, considering not only the tagger performance but also the model size and the time needed to train it. We also address the problem of duplicated data, which is rather common for languages with a limited availability of historical corpora. We show that deduplication reduces training time without affecting performance. The model and its smaller versions are all made available to the research community.</abstract>
    <identifier type="citekey">palmero-aprosio-etal-2022-bertoldo</identifier>
    <location>
      <url>https://aclanthology.org/2022.lt4hala-1.10</url>
    </location>
    <part>
      <date>2022-06</date>
      <extent unit="page">
        <start>68</start>
        <end>72</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T BERToldo, the Historical BERT for Italian
%A Palmero Aprosio, Alessio
%A Menini, Stefano
%A Tonelli, Sara
%Y Sprugnoli, Rachele
%Y Passarotti, Marco
%S Proceedings of the Second Workshop on Language Technologies for Historical and Ancient Languages
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F palmero-aprosio-etal-2022-bertoldo
%X Recent works in historical language processing have shown that transformer-based models can be successfully created using historical corpora, and that using them for analysing and classifying data from the past can be beneficial compared to standard transformer models. This has led to the creation of BERT-like models for different languages trained with digital repositories from the past. In this work we introduce the Italian version of historical BERT, which we call BERToldo. We evaluate the model on the task of PoS-tagging Dante Alighieri’s works, considering not only the tagger performance but also the model size and the time needed to train it. We also address the problem of duplicated data, which is rather common for languages with a limited availability of historical corpora. We show that deduplication reduces training time without affecting performance. The model and its smaller versions are all made available to the research community.
%U https://aclanthology.org/2022.lt4hala-1.10
%P 68-72
Markdown (Informal)
[BERToldo, the Historical BERT for Italian](https://aclanthology.org/2022.lt4hala-1.10) (Palmero Aprosio et al., LT4HALA 2022)

ACL
Alessio Palmero Aprosio, Stefano Menini, and Sara Tonelli. 2022. BERToldo, the Historical BERT for Italian. In Proceedings of the Second Workshop on Language Technologies for Historical and Ancient Languages, pages 68–72, Marseille, France. European Language Resources Association.
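
The abstract also reports that deduplicating the training data reduced training time without hurting downstream performance. The paper's exact procedure is not reproduced here; the snippet below is only a minimal sketch of the general idea (exact-match removal of repeated lines in a line-based corpus, with hypothetical file names):

```python
# Minimal sketch of exact-duplicate removal over a line-based corpus, the kind
# of deduplication step the abstract credits with cutting training time.
# Hash each normalized line and keep only its first occurrence.
import hashlib

def deduplicate(in_path: str, out_path: str) -> None:
    seen = set()
    with open(in_path, encoding="utf-8") as src, \
         open(out_path, "w", encoding="utf-8") as dst:
        for line in src:
            key = hashlib.sha1(line.strip().lower().encode("utf-8")).hexdigest()
            if key not in seen:
                seen.add(key)
                dst.write(line)

deduplicate("corpus.txt", "corpus.dedup.txt")  # hypothetical file names
```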