@inproceedings{malaysha-etal-2023-context,
title = "Context-Gloss Augmentation for Improving {A}rabic Target Sense Verification",
author = "Malaysha, Sanad and
Jarrar, Mustafa and
Khalilia, Mohammed",
editor = "Rigau, German and
Bond, Francis and
Rademaker, Alexandre",
booktitle = "Proceedings of the 12th Global Wordnet Conference",
month = jan,
year = "2023",
address = "University of the Basque Country, Donostia - San Sebastian, Basque Country",
publisher = "Global Wordnet Association",
url = "https://aclanthology.org/2023.gwc-1.31",
pages = "254--262",
abstract = "Arabic language lacks semantic datasets and sense inventories. The most common semantically-labeled dataset for Arabic is the ArabGlossBERT, a relatively small dataset that consists of 167K context-gloss pairs (about 60K positive and 107K negative pairs), collected from Arabic dictionaries. This paper presents an enrichment to the ArabGlossBERT dataset, by augmenting it using (Arabic-English-Arabic) machine back-translation. Augmentation increased the dataset size to 352K pairs (149K positive and 203K negative pairs). We measure the impact of augmentation using different data configurations to fine-tune BERT on target sense verification (TSV) task. Overall, the accuracy ranges between 78{\%} to 84{\%} for different data configurations. Although our approach performed at par with the baseline, we did observe some improvements for some POS tags in some experiments. Furthermore, our fine-tuned models are trained on a larger dataset covering larger vocabulary and contexts. We provide an in-depth analysis of the accuracy for each part-of-speech (POS).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="malaysha-etal-2023-context">
<titleInfo>
<title>Context-Gloss Augmentation for Improving Arabic Target Sense Verification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sanad</namePart>
<namePart type="family">Malaysha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mustafa</namePart>
<namePart type="family">Jarrar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammed</namePart>
<namePart type="family">Khalilia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Global Wordnet Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">German</namePart>
<namePart type="family">Rigau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Bond</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandre</namePart>
<namePart type="family">Rademaker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Global Wordnet Association</publisher>
<place>
<placeTerm type="text">University of the Basque Country, Donostia - San Sebastian, Basque Country</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Arabic language lacks semantic datasets and sense inventories. The most common semantically-labeled dataset for Arabic is the ArabGlossBERT, a relatively small dataset that consists of 167K context-gloss pairs (about 60K positive and 107K negative pairs), collected from Arabic dictionaries. This paper presents an enrichment to the ArabGlossBERT dataset, by augmenting it using (Arabic-English-Arabic) machine back-translation. Augmentation increased the dataset size to 352K pairs (149K positive and 203K negative pairs). We measure the impact of augmentation using different data configurations to fine-tune BERT on target sense verification (TSV) task. Overall, the accuracy ranges between 78% to 84% for different data configurations. Although our approach performed at par with the baseline, we did observe some improvements for some POS tags in some experiments. Furthermore, our fine-tuned models are trained on a larger dataset covering larger vocabulary and contexts. We provide an in-depth analysis of the accuracy for each part-of-speech (POS).</abstract>
<identifier type="citekey">malaysha-etal-2023-context</identifier>
<location>
<url>https://aclanthology.org/2023.gwc-1.31</url>
</location>
<part>
<date>2023-01</date>
<extent unit="page">
<start>254</start>
<end>262</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Context-Gloss Augmentation for Improving Arabic Target Sense Verification
%A Malaysha, Sanad
%A Jarrar, Mustafa
%A Khalilia, Mohammed
%Y Rigau, German
%Y Bond, Francis
%Y Rademaker, Alexandre
%S Proceedings of the 12th Global Wordnet Conference
%D 2023
%8 January
%I Global Wordnet Association
%C University of the Basque Country, Donostia - San Sebastian, Basque Country
%F malaysha-etal-2023-context
%X Arabic language lacks semantic datasets and sense inventories. The most common semantically-labeled dataset for Arabic is the ArabGlossBERT, a relatively small dataset that consists of 167K context-gloss pairs (about 60K positive and 107K negative pairs), collected from Arabic dictionaries. This paper presents an enrichment to the ArabGlossBERT dataset, by augmenting it using (Arabic-English-Arabic) machine back-translation. Augmentation increased the dataset size to 352K pairs (149K positive and 203K negative pairs). We measure the impact of augmentation using different data configurations to fine-tune BERT on target sense verification (TSV) task. Overall, the accuracy ranges between 78% to 84% for different data configurations. Although our approach performed at par with the baseline, we did observe some improvements for some POS tags in some experiments. Furthermore, our fine-tuned models are trained on a larger dataset covering larger vocabulary and contexts. We provide an in-depth analysis of the accuracy for each part-of-speech (POS).
%U https://aclanthology.org/2023.gwc-1.31
%P 254-262
Markdown (Informal)
[Context-Gloss Augmentation for Improving Arabic Target Sense Verification](https://aclanthology.org/2023.gwc-1.31) (Malaysha et al., GWC 2023)
ACL