% Journal article record (Computational Linguistics 48(4), December 2022).
% In the title, words that must keep their capital under sentence-casing
% styles are brace-protected as whole words: the proper noun "Arabic" and
% "Partial", which opens a new sentence after the question mark.
@article{esmail-etal-2022-much,
  author    = {Esmail, Saeed and Bar, Kfir and Dershowitz, Nachum},
  title     = {How Much Does Lookahead Matter for Disambiguation? {Partial} {Arabic} Diacritization Case Study},
  journal   = {Computational Linguistics},
  volume    = {48},
  number    = {4},
  pages     = {1103--1123},
  month     = dec,
  year      = {2022},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  doi       = {10.1162/coli_a_00456},
  url       = {https://aclanthology.org/2022.cl-4.20},
  abstract  = {We suggest a model for partial diacritization of deep orthographies. We focus on Arabic, where the optional indication of selected vowels by means of diacritics can resolve ambiguity and improve readability. Our partial diacritizer restores short vowels only when they contribute to the ease of understandability during reading a given running text. The idea is to identify those uncertainties of absent vowels that require the reader to look ahead to disambiguate. To achieve this, two independent neural networks are used for predicting diacritics, one that takes the entire sentence as input and another that considers only the text that has been read thus far. Partial diacritization is then determined by retaining precisely those vowels on which the two networks disagree, preferring the reading based on consideration of the whole sentence over the more na{\"\i}ve reading-order diacritization. For evaluation, we prepared a new dataset of Arabic texts with both full and partial vowelization. In addition to facilitating readability, we find that our partial diacritizer improves translation quality compared either to their total absence or to random selection. Lastly, we study the benefit of knowing the text that follows the word in focus toward the restoration of short vowels during reading, and we measure the degree to which lookahead contributes to resolving ambiguities encountered while reading. L{'}Herbelot had asserted, that the most ancient Korans, written in the Cufic character, had no vowel points; and that these were first invented by Jahia{--}ben Jamer, who died in the 127th year of the Hegira. {``}Toderini{'}s History of Turkish Literature,{''} Analytical Review (1789)},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="esmail-etal-2022-much">
    <titleInfo>
      <title>How Much Does Lookahead Matter for Disambiguation? Partial Arabic Diacritization Case Study</title>
    </titleInfo>

    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Saeed</namePart>
      <namePart type="family">Esmail</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kfir</namePart>
      <namePart type="family">Bar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nachum</namePart>
      <namePart type="family">Dershowitz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <genre authority="bibutilsgt">journal article</genre>

    <!-- Host publication: the journal in which the article appeared. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Computational Linguistics</title>
      </titleInfo>
      <originInfo>
        <issuance>continuing</issuance>
        <publisher>MIT Press</publisher>
        <place>
          <placeTerm type="text">Cambridge, MA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">periodical</genre>
      <genre authority="bibutilsgt">academic journal</genre>
    </relatedItem>

    <abstract>We suggest a model for partial diacritization of deep orthographies. We focus on Arabic, where the optional indication of selected vowels by means of diacritics can resolve ambiguity and improve readability. Our partial diacritizer restores short vowels only when they contribute to the ease of understandability during reading a given running text. The idea is to identify those uncertainties of absent vowels that require the reader to look ahead to disambiguate. To achieve this, two independent neural networks are used for predicting diacritics, one that takes the entire sentence as input and another that considers only the text that has been read thus far. Partial diacritization is then determined by retaining precisely those vowels on which the two networks disagree, preferring the reading based on consideration of the whole sentence over the more naïve reading-order diacritization. For evaluation, we prepared a new dataset of Arabic texts with both full and partial vowelization. In addition to facilitating readability, we find that our partial diacritizer improves translation quality compared either to their total absence or to random selection. Lastly, we study the benefit of knowing the text that follows the word in focus toward the restoration of short vowels during reading, and we measure the degree to which lookahead contributes to resolving ambiguities encountered while reading. L’Herbelot had asserted, that the most ancient Korans, written in the Cufic character, had no vowel points; and that these were first invented by Jahia–ben Jamer, who died in the 127th year of the Hegira. “Toderini’s History of Turkish Literature,” Analytical Review (1789)</abstract>

    <!-- Identifiers and access location. -->
    <identifier type="citekey">esmail-etal-2022-much</identifier>
    <identifier type="doi">10.1162/coli_a_00456</identifier>
    <location>
      <url>https://aclanthology.org/2022.cl-4.20</url>
    </location>

    <!-- Position within the host: volume, issue and page range. -->
    <part>
      <date>2022-12</date>
      <detail type="volume">
        <number>48</number>
      </detail>
      <detail type="issue">
        <number>4</number>
      </detail>
      <extent unit="page">
        <start>1103</start>
        <end>1123</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Journal Article
%T How Much Does Lookahead Matter for Disambiguation? Partial Arabic Diacritization Case Study
%A Esmail, Saeed
%A Bar, Kfir
%A Dershowitz, Nachum
%J Computational Linguistics
%D 2022
%8 December
%V 48
%N 4
%I MIT Press
%C Cambridge, MA
%F esmail-etal-2022-much
%X We suggest a model for partial diacritization of deep orthographies. We focus on Arabic, where the optional indication of selected vowels by means of diacritics can resolve ambiguity and improve readability. Our partial diacritizer restores short vowels only when they contribute to the ease of understandability during reading a given running text. The idea is to identify those uncertainties of absent vowels that require the reader to look ahead to disambiguate. To achieve this, two independent neural networks are used for predicting diacritics, one that takes the entire sentence as input and another that considers only the text that has been read thus far. Partial diacritization is then determined by retaining precisely those vowels on which the two networks disagree, preferring the reading based on consideration of the whole sentence over the more naïve reading-order diacritization. For evaluation, we prepared a new dataset of Arabic texts with both full and partial vowelization. In addition to facilitating readability, we find that our partial diacritizer improves translation quality compared either to their total absence or to random selection. Lastly, we study the benefit of knowing the text that follows the word in focus toward the restoration of short vowels during reading, and we measure the degree to which lookahead contributes to resolving ambiguities encountered while reading. L’Herbelot had asserted, that the most ancient Korans, written in the Cufic character, had no vowel points; and that these were first invented by Jahia–ben Jamer, who died in the 127th year of the Hegira. “Toderini’s History of Turkish Literature,” Analytical Review (1789)
%R 10.1162/coli_a_00456
%U https://aclanthology.org/2022.cl-4.20
%U https://doi.org/10.1162/coli_a_00456
%P 1103-1123
Markdown (Informal)
[How Much Does Lookahead Matter for Disambiguation? Partial Arabic Diacritization Case Study](https://aclanthology.org/2022.cl-4.20) (Esmail et al., CL 2022)
ACL