@inproceedings{naaijer-etal-2023-transformer,
    title = "A Transformer-based parser for {Syriac} morphology",
    author = "Naaijer, Martijn and
      Sikkel, Constantijn and
      Coeckelbergs, Mathias and
      Attema, Jisk and
      Van Peursen, Willem Th.",
    editor = "Anderson, Adam and
      Gordin, Shai and
      Li, Bin and
      Liu, Yudong and
      Passarotti, Marco C.",
    booktitle = "Proceedings of the Ancient Language Processing Workshop",
    month = sep,
    year = "2023",
    address = "Varna, Bulgaria",
    publisher = "INCOMA Ltd., Shoumen, Bulgaria",
    url = "https://aclanthology.org/2023.alp-1.3",
    pages = "23--29",
    abstract = "In this project we train a Transformer-based model from scratch, with the goal of parsing the morphology of Ancient Syriac texts as accurately as possible. Syriac is still a low resource language, only a relatively small training set was available. Therefore, the training set was expanded by adding Biblical Hebrew data to it. Five different experiments were done: the model was trained on Syriac data only, it was trained with mixed Syriac and (un)vocalized Hebrew data, and it was pretrained on (un)vocalized Hebrew data and then finetuned on Syriac data. The models trained on Hebrew and Syriac data consistently outperform the models trained on Syriac data only. This shows, that the differences between Syriac and Hebrew are small enough that it is worth adding Hebrew data to train the model for parsing Syriac morphology. Training models on different languages is an important trend in NLP, we show that this works well for relatively small datasets of Syriac and Hebrew.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="naaijer-etal-2023-transformer">
<titleInfo>
<title>A Transformer-based parser for Syriac morphology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Martijn</namePart>
<namePart type="family">Naaijer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Constantijn</namePart>
<namePart type="family">Sikkel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mathias</namePart>
<namePart type="family">Coeckelbergs</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jisk</namePart>
<namePart type="family">Attema</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Willem</namePart>
<namePart type="given">Th.</namePart>
<namePart type="family">Van Peursen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ancient Language Processing Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Adam</namePart>
<namePart type="family">Anderson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shai</namePart>
<namePart type="family">Gordin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yudong</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="given">C.</namePart>
<namePart type="family">Passarotti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this project we train a Transformer-based model from scratch, with the goal of parsing the morphology of Ancient Syriac texts as accurately as possible. Syriac is still a low resource language, only a relatively small training set was available. Therefore, the training set was expanded by adding Biblical Hebrew data to it. Five different experiments were done: the model was trained on Syriac data only, it was trained with mixed Syriac and (un)vocalized Hebrew data, and it was pretrained on (un)vocalized Hebrew data and then finetuned on Syriac data. The models trained on Hebrew and Syriac data consistently outperform the models trained on Syriac data only. This shows, that the differences between Syriac and Hebrew are small enough that it is worth adding Hebrew data to train the model for parsing Syriac morphology. Training models on different languages is an important trend in NLP, we show that this works well for relatively small datasets of Syriac and Hebrew.</abstract>
<identifier type="citekey">naaijer-etal-2023-transformer</identifier>
<location>
<url>https://aclanthology.org/2023.alp-1.3</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>23</start>
<end>29</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Transformer-based parser for Syriac morphology
%A Naaijer, Martijn
%A Sikkel, Constantijn
%A Coeckelbergs, Mathias
%A Attema, Jisk
%A Van Peursen, Willem Th.
%Y Anderson, Adam
%Y Gordin, Shai
%Y Li, Bin
%Y Liu, Yudong
%Y Passarotti, Marco C.
%S Proceedings of the Ancient Language Processing Workshop
%D 2023
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F naaijer-etal-2023-transformer
%X In this project we train a Transformer-based model from scratch, with the goal of parsing the morphology of Ancient Syriac texts as accurately as possible. Syriac is still a low resource language, only a relatively small training set was available. Therefore, the training set was expanded by adding Biblical Hebrew data to it. Five different experiments were done: the model was trained on Syriac data only, it was trained with mixed Syriac and (un)vocalized Hebrew data, and it was pretrained on (un)vocalized Hebrew data and then finetuned on Syriac data. The models trained on Hebrew and Syriac data consistently outperform the models trained on Syriac data only. This shows, that the differences between Syriac and Hebrew are small enough that it is worth adding Hebrew data to train the model for parsing Syriac morphology. Training models on different languages is an important trend in NLP, we show that this works well for relatively small datasets of Syriac and Hebrew.
%U https://aclanthology.org/2023.alp-1.3
%P 23-29
Markdown (Informal)
[A Transformer-based parser for Syriac morphology](https://aclanthology.org/2023.alp-1.3) (Naaijer et al., ALP-WS 2023)
ACL
- Martijn Naaijer, Constantijn Sikkel, Mathias Coeckelbergs, Jisk Attema, and Willem Th. Van Peursen. 2023. A Transformer-based parser for Syriac morphology. In Proceedings of the Ancient Language Processing Workshop, pages 23–29, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.