% Paper in the proceedings of the 6th OSACT workshop (LREC-COLING 2024);
% ACL Anthology record 2024.osact-1.14.
% Field values are brace-delimited. Acronyms and proper nouns in title and
% booktitle are brace-protected so that styles which recase titles keep
% their capitalisation.
@inproceedings{abdelaziz-etal-2024-llm,
  title     = {{LLM}-based {MT} Data Creation: Dialectal to {MSA} Translation Shared Task},
  author    = {Abdelaziz, AhmedElmogtaba Abdelmoniem Ali and
               Elneima, Ashraf Hatim and
               Darwish, Kareem},
  editor    = {Al-Khalifa, Hend and
               Darwish, Kareem and
               Mubarak, Hamdy and
               Ali, Mona and
               Elsayed, Tamer},
  booktitle = {Proceedings of the 6th Workshop on Open-Source {Arabic} Corpora and Processing Tools ({OSACT}) with Shared Tasks on {Arabic} {LLMs} Hallucination and Dialect to {MSA} Machine Translation @ {LREC-COLING} 2024},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.osact-1.14},
  pages     = {112--116},
  abstract  = {This paper presents our approach to the Dialect to Modern Standard Arabic (MSA) Machine Translation shared task, conducted as part of the sixth Workshop on Open-Source Arabic Corpora and Processing Tools (OSACT6). Our primary contribution is the development of a novel dataset derived from The Saudi Audio Dataset for Arabic (SADA) an Arabic audio corpus. By employing an automated method utilizing ChatGPT 3.5, we translated the dialectal Arabic texts to their MSA equivalents. This process not only yielded a unique and valuable dataset but also showcased an efficient method for leveraging language models in dataset generation. Utilizing this dataset, alongside additional resources, we trained a machine translation model based on the Transformer architecture. Through systematic experimentation with model configurations, we achieved notable improvements in translation quality. Our findings highlight the significance of LLM-assisted dataset creation methodologies and their impact on advancing machine translation systems, particularly for languages with considerable dialectal diversity like Arabic.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey abdelaziz-etal-2024-llm. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="abdelaziz-etal-2024-llm">

    <!-- Title of the paper itself. -->
    <titleInfo>
      <title>LLM-based MT Data Creation: Dialectal to MSA Translation Shared Task</title>
    </titleInfo>

    <!-- Authors, in the order they are listed in the record. -->
    <name type="personal">
      <namePart type="given">AhmedElmogtaba</namePart>
      <namePart type="given">Abdelmoniem</namePart>
      <namePart type="given">Ali</namePart>
      <namePart type="family">Abdelaziz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ashraf</namePart>
      <namePart type="given">Hatim</namePart>
      <namePart type="family">Elneima</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kareem</namePart>
      <namePart type="family">Darwish</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date of the paper (year-month). -->
    <originInfo>
      <dateIssued>2024-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume containing the paper, with its editors. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 6th Workshop on Open-Source Arabic Corpora and Processing Tools (OSACT) with Shared Tasks on Arabic LLMs Hallucination and Dialect to MSA Machine Translation @ LREC-COLING 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Hend</namePart>
        <namePart type="family">Al-Khalifa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kareem</namePart>
        <namePart type="family">Darwish</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hamdy</namePart>
        <namePart type="family">Mubarak</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mona</namePart>
        <namePart type="family">Ali</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tamer</namePart>
        <namePart type="family">Elsayed</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <!-- Publisher and place of the proceedings volume. -->
      <originInfo>
        <publisher>ELRA and ICCL</publisher>
        <place>
          <placeTerm type="text">Torino, Italia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <abstract>This paper presents our approach to the Dialect to Modern Standard Arabic (MSA) Machine Translation shared task, conducted as part of the sixth Workshop on Open-Source Arabic Corpora and Processing Tools (OSACT6). Our primary contribution is the development of a novel dataset derived from The Saudi Audio Dataset for Arabic (SADA) an Arabic audio corpus. By employing an automated method utilizing ChatGPT 3.5, we translated the dialectal Arabic texts to their MSA equivalents. This process not only yielded a unique and valuable dataset but also showcased an efficient method for leveraging language models in dataset generation. Utilizing this dataset, alongside additional resources, we trained a machine translation model based on the Transformer architecture. Through systematic experimentation with model configurations, we achieved notable improvements in translation quality. Our findings highlight the significance of LLM-assisted dataset creation methodologies and their impact on advancing machine translation systems, particularly for languages with considerable dialectal diversity like Arabic.</abstract>

    <!-- Identifiers: citation key and landing-page URL. -->
    <identifier type="citekey">abdelaziz-etal-2024-llm</identifier>
    <location>
      <url>https://aclanthology.org/2024.osact-1.14</url>
    </location>

    <!-- Date and page extent of this paper. -->
    <part>
      <date>2024-05</date>
      <extent unit="page">
        <start>112</start>
        <end>116</end>
      </extent>
    </part>

  </mods>
</modsCollection>
%0 Conference Proceedings
%T LLM-based MT Data Creation: Dialectal to MSA Translation Shared Task
%A Abdelaziz, AhmedElmogtaba Abdelmoniem Ali
%A Elneima, Ashraf Hatim
%A Darwish, Kareem
%Y Al-Khalifa, Hend
%Y Darwish, Kareem
%Y Mubarak, Hamdy
%Y Ali, Mona
%Y Elsayed, Tamer
%S Proceedings of the 6th Workshop on Open-Source Arabic Corpora and Processing Tools (OSACT) with Shared Tasks on Arabic LLMs Hallucination and Dialect to MSA Machine Translation @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F abdelaziz-etal-2024-llm
%X This paper presents our approach to the Dialect to Modern Standard Arabic (MSA) Machine Translation shared task, conducted as part of the sixth Workshop on Open-Source Arabic Corpora and Processing Tools (OSACT6). Our primary contribution is the development of a novel dataset derived from The Saudi Audio Dataset for Arabic (SADA) an Arabic audio corpus. By employing an automated method utilizing ChatGPT 3.5, we translated the dialectal Arabic texts to their MSA equivalents. This process not only yielded a unique and valuable dataset but also showcased an efficient method for leveraging language models in dataset generation. Utilizing this dataset, alongside additional resources, we trained a machine translation model based on the Transformer architecture. Through systematic experimentation with model configurations, we achieved notable improvements in translation quality. Our findings highlight the significance of LLM-assisted dataset creation methodologies and their impact on advancing machine translation systems, particularly for languages with considerable dialectal diversity like Arabic.
%U https://aclanthology.org/2024.osact-1.14
%P 112-116
Markdown (Informal)
[LLM-based MT Data Creation: Dialectal to MSA Translation Shared Task](https://aclanthology.org/2024.osact-1.14) (Abdelaziz et al., OSACT-WS 2024)
ACL
- AhmedElmogtaba Abdelmoniem Ali Abdelaziz, Ashraf Hatim Elneima, and Kareem Darwish. 2024. LLM-based MT Data Creation: Dialectal to MSA Translation Shared Task. In Proceedings of the 6th Workshop on Open-Source Arabic Corpora and Processing Tools (OSACT) with Shared Tasks on Arabic LLMs Hallucination and Dialect to MSA Machine Translation @ LREC-COLING 2024, pages 112–116, Torino, Italia. ELRA and ICCL.