@inproceedings{lazar-etal-2021-filling,
title = "Filling the Gaps in {A}ncient {A}kkadian Texts: A Masked Language Modelling Approach",
author = "Lazar, Koren and
Saret, Benny and
Yehudai, Asaf and
Horowitz, Wayne and
Wasserman, Nathan and
Stanovsky, Gabriel",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.384",
doi = "10.18653/v1/2021.emnlp-main.384",
pages = "4682--4691",
abstract = "We present models which complete missing text given transliterations of ancient Mesopotamian documents, originally written on cuneiform clay tablets (2500 BCE - 100 CE). Due to the tablets{'} deterioration, scholars often rely on contextual cues to manually fill in missing parts in the text in a subjective and time-consuming process. We identify that this challenge can be formulated as a masked language modelling task, used mostly as a pretraining objective for contextualized language models. Following, we develop several architectures focusing on the Akkadian language, the lingua franca of the time. We find that despite data scarcity (1M tokens) we can achieve state of the art performance on missing tokens prediction (89{\%} hit@5) using a greedy decoding scheme and pretraining on data from other languages and different time periods. Finally, we conduct human evaluations showing the applicability of our models in assisting experts to transcribe texts in extinct languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lazar-etal-2021-filling">
<titleInfo>
<title>Filling the Gaps in Ancient Akkadian Texts: A Masked Language Modelling Approach</title>
</titleInfo>
<name type="personal">
<namePart type="given">Koren</namePart>
<namePart type="family">Lazar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benny</namePart>
<namePart type="family">Saret</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asaf</namePart>
<namePart type="family">Yehudai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wayne</namePart>
<namePart type="family">Horowitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nathan</namePart>
<namePart type="family">Wasserman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanjing</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Scott</namePart>
<namePart type="given">Wen-tau</namePart>
<namePart type="family">Yih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online and Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present models which complete missing text given transliterations of ancient Mesopotamian documents, originally written on cuneiform clay tablets (2500 BCE - 100 CE). Due to the tablets’ deterioration, scholars often rely on contextual cues to manually fill in missing parts in the text in a subjective and time-consuming process. We identify that this challenge can be formulated as a masked language modelling task, used mostly as a pretraining objective for contextualized language models. Following, we develop several architectures focusing on the Akkadian language, the lingua franca of the time. We find that despite data scarcity (1M tokens) we can achieve state of the art performance on missing tokens prediction (89% hit@5) using a greedy decoding scheme and pretraining on data from other languages and different time periods. Finally, we conduct human evaluations showing the applicability of our models in assisting experts to transcribe texts in extinct languages.</abstract>
<identifier type="citekey">lazar-etal-2021-filling</identifier>
<identifier type="doi">10.18653/v1/2021.emnlp-main.384</identifier>
<location>
<url>https://aclanthology.org/2021.emnlp-main.384</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>4682</start>
<end>4691</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Filling the Gaps in Ancient Akkadian Texts: A Masked Language Modelling Approach
%A Lazar, Koren
%A Saret, Benny
%A Yehudai, Asaf
%A Horowitz, Wayne
%A Wasserman, Nathan
%A Stanovsky, Gabriel
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online and Punta Cana, Dominican Republic
%F lazar-etal-2021-filling
%X We present models which complete missing text given transliterations of ancient Mesopotamian documents, originally written on cuneiform clay tablets (2500 BCE - 100 CE). Due to the tablets’ deterioration, scholars often rely on contextual cues to manually fill in missing parts in the text in a subjective and time-consuming process. We identify that this challenge can be formulated as a masked language modelling task, used mostly as a pretraining objective for contextualized language models. Following, we develop several architectures focusing on the Akkadian language, the lingua franca of the time. We find that despite data scarcity (1M tokens) we can achieve state of the art performance on missing tokens prediction (89% hit@5) using a greedy decoding scheme and pretraining on data from other languages and different time periods. Finally, we conduct human evaluations showing the applicability of our models in assisting experts to transcribe texts in extinct languages.
%R 10.18653/v1/2021.emnlp-main.384
%U https://aclanthology.org/2021.emnlp-main.384
%U https://doi.org/10.18653/v1/2021.emnlp-main.384
%P 4682-4691
Markdown (Informal)
[Filling the Gaps in Ancient Akkadian Texts: A Masked Language Modelling Approach](https://aclanthology.org/2021.emnlp-main.384) (Lazar et al., EMNLP 2021)
ACL