@inproceedings{heinecke-2024-udparse,
title = "{UDP}arse @ {SIGTYP} 2024 Shared Task : Modern Language Models for Historical Languages",
author = "Heinecke, Johannes",
editor = "Hahn, Michael and
Sorokin, Alexey and
Kumar, Ritesh and
Shcherbakov, Andreas and
Otmakhova, Yulia and
Yang, Jinrui and
Serikov, Oleg and
Rani, Priya and
Ponti, Edoardo M. and
Murado{\u{g}}lu, Saliha and
Gao, Rena and
Cotterell, Ryan and
Vylomova, Ekaterina",
booktitle = "Proceedings of the 6th Workshop on Research in Computational Linguistic Typology and Multilingual NLP",
month = mar,
year = "2024",
address = "St. Julian's, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.sigtyp-1.17",
pages = "142--150",
abstract = "SIGTYP{'}s Shared Task on Word Embedding Evaluation for Ancient and Historical Languages was proposed in two variants, constrained or unconstrained. Whereas the constrained variant disallowed any other data to train embeddings or models than the data provided, the unconstrained variant did not have these limits. We participated in the five tasks of the unconstrained variant and came out first. The tasks were the prediction of part-of-speech, lemmas and morphological features and filling masked words and masked characters on 16 historical languages. We decided to use a dependency parser and train the data using an underlying pretrained transformer model to predict part-of-speech tags, lemmas, and morphological features. For predicting masked words, we used multilingual distilBERT (with rather bad results). In order to predict masked characters, our language model is extremely small: it is a model of 5-gram frequencies, obtained by reading the available training data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="heinecke-2024-udparse">
<titleInfo>
<title>UDParse @ SIGTYP 2024 Shared Task : Modern Language Models for Historical Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Johannes</namePart>
<namePart type="family">Heinecke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 6th Workshop on Research in Computational Linguistic Typology and Multilingual NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Hahn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexey</namePart>
<namePart type="family">Sorokin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ritesh</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Shcherbakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulia</namePart>
<namePart type="family">Otmakhova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinrui</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oleg</namePart>
<namePart type="family">Serikov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Priya</namePart>
<namePart type="family">Rani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edoardo</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Ponti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saliha</namePart>
<namePart type="family">Muradoğlu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rena</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Cotterell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julian’s, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>SIGTYP’s Shared Task on Word Embedding Evaluation for Ancient and Historical Languages was proposed in two variants, constrained or unconstrained. Whereas the constrained variant disallowed any other data to train embeddings or models than the data provided, the unconstrained variant did not have these limits. We participated in the five tasks of the unconstrained variant and came out first. The tasks were the prediction of part-of-speech, lemmas and morphological features and filling masked words and masked characters on 16 historical languages. We decided to use a dependency parser and train the data using an underlying pretrained transformer model to predict part-of-speech tags, lemmas, and morphological features. For predicting masked words, we used multilingual distilBERT (with rather bad results). In order to predict masked characters, our language model is extremely small: it is a model of 5-gram frequencies, obtained by reading the available training data.</abstract>
<identifier type="citekey">heinecke-2024-udparse</identifier>
<location>
<url>https://aclanthology.org/2024.sigtyp-1.17</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>142</start>
<end>150</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T UDParse @ SIGTYP 2024 Shared Task : Modern Language Models for Historical Languages
%A Heinecke, Johannes
%Y Hahn, Michael
%Y Sorokin, Alexey
%Y Kumar, Ritesh
%Y Shcherbakov, Andreas
%Y Otmakhova, Yulia
%Y Yang, Jinrui
%Y Serikov, Oleg
%Y Rani, Priya
%Y Ponti, Edoardo M.
%Y Muradoğlu, Saliha
%Y Gao, Rena
%Y Cotterell, Ryan
%Y Vylomova, Ekaterina
%S Proceedings of the 6th Workshop on Research in Computational Linguistic Typology and Multilingual NLP
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F heinecke-2024-udparse
%X SIGTYP’s Shared Task on Word Embedding Evaluation for Ancient and Historical Languages was proposed in two variants, constrained or unconstrained. Whereas the constrained variant disallowed any other data to train embeddings or models than the data provided, the unconstrained variant did not have these limits. We participated in the five tasks of the unconstrained variant and came out first. The tasks were the prediction of part-of-speech, lemmas and morphological features and filling masked words and masked characters on 16 historical languages. We decided to use a dependency parser and train the data using an underlying pretrained transformer model to predict part-of-speech tags, lemmas, and morphological features. For predicting masked words, we used multilingual distilBERT (with rather bad results). In order to predict masked characters, our language model is extremely small: it is a model of 5-gram frequencies, obtained by reading the available training data.
%U https://aclanthology.org/2024.sigtyp-1.17
%P 142-150
Markdown (Informal)
[UDParse @ SIGTYP 2024 Shared Task : Modern Language Models for Historical Languages](https://aclanthology.org/2024.sigtyp-1.17) (Heinecke, SIGTYP-WS 2024)
ACL