@inproceedings{hirica-etal-2025-arabic,
title = "{A}rabic to {R}omanian Machine Translation: A Case Study on Distant Language Pairs",
author = "Hirica, Ioan Alexandru and
Tabusca, Stefana Arina and
Nisioi, Sergiu",
editor = "Angelova, Galia and
Kunilovskaya, Maria and
Escribe, Marie and
Mitkov, Ruslan",
booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ranlp-1.51/",
pages = "423--432",
abstract = "This paper investigates machine translation between two linguistically distant languages, Arabic and Romanian, with a focus on translating from Arabic to Romanian. Dataset cleaning techniques are addressed, offering insights on the impact of translation for a language pair with limited resources. Using publicly available corpora (e.g., OPUS) and manually translated diplomatic texts, filtering methods are applied, such as duplicate removal, embedding similarity analysis (LEALLA), and Large Language Model (LLM)-based validation (Gemini-flash-002). Transformer models are trained and evaluated with diverse preprocessing pipelines that incorporate subword tokenization. Additionally, the performance of a fine-tuned LLM is assessed for this task and is compared to their pre-trained counterparts. Despite computational limitations, the results emphasize the importance of targeted preprocessing and model adaptation in improving Arabic-Romanian translation quality."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hirica-etal-2025-arabic">
<titleInfo>
<title>Arabic to Romanian Machine Translation: A Case Study on Distant Language Pairs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ioan</namePart>
<namePart type="given">Alexandru</namePart>
<namePart type="family">Hirica</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefana</namePart>
<namePart type="given">Arina</namePart>
<namePart type="family">Tabusca</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergiu</namePart>
<namePart type="family">Nisioi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper investigates machine translation between two linguistically distant languages, Arabic and Romanian, with a focus on translating from Arabic to Romanian. Dataset cleaning techniques are addressed, offering insights on the impact of translation for a language pair with limited resources. Using publicly available corpora (e.g., OPUS) and manually translated diplomatic texts, filtering methods are applied, such as duplicate removal, embedding similarity analysis (LEALLA), and Large Language Model (LLM)-based validation (Gemini-flash-002). Transformer models are trained and evaluated with diverse preprocessing pipelines that incorporate subword tokenization. Additionally, the performance of a fine-tuned LLM is assessed for this task and is compared to their pre-trained counterparts. Despite computational limitations, the results emphasize the importance of targeted preprocessing and model adaptation in improving Arabic-Romanian translation quality.</abstract>
<identifier type="citekey">hirica-etal-2025-arabic</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.51/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>423</start>
<end>432</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Arabic to Romanian Machine Translation: A Case Study on Distant Language Pairs
%A Hirica, Ioan Alexandru
%A Tabusca, Stefana Arina
%A Nisioi, Sergiu
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F hirica-etal-2025-arabic
%X This paper investigates machine translation between two linguistically distant languages, Arabic and Romanian, with a focus on translating from Arabic to Romanian. Dataset cleaning techniques are addressed, offering insights on the impact of translation for a language pair with limited resources. Using publicly available corpora (e.g., OPUS) and manually translated diplomatic texts, filtering methods are applied, such as duplicate removal, embedding similarity analysis (LEALLA), and Large Language Model (LLM)-based validation (Gemini-flash-002). Transformer models are trained and evaluated with diverse preprocessing pipelines that incorporate subword tokenization. Additionally, the performance of a fine-tuned LLM is assessed for this task and is compared to their pre-trained counterparts. Despite computational limitations, the results emphasize the importance of targeted preprocessing and model adaptation in improving Arabic-Romanian translation quality.
%U https://aclanthology.org/2025.ranlp-1.51/
%P 423-432
Markdown (Informal)
[Arabic to Romanian Machine Translation: A Case Study on Distant Language Pairs](https://aclanthology.org/2025.ranlp-1.51/) (Hirica et al., RANLP 2025)
ACL