@inproceedings{pavlova-2025-multi,
    title = "Multi-stage Training of Bilingual Islamic {LLM} for Neural Passage Retrieval",
    author = "Pavlova, Vera",
    editor = "Yagi, Sane and
      Sawalha, Majdi and
      Shawar, Bayan Abu and
      AlShdaifat, Abdallah T. and
      Abbas, Norhan and
      {Organizers}",
    booktitle = "Proceedings of the New Horizons in Computational Linguistics for Religious Texts",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.clrel-1.4/",
    pages = "42--52",
    abstract = "This study examines the use of Natural Language Processing (NLP) technology within the Islamic domain, focusing on developing an Islamic neural retrieval model. By leveraging the robust XLM-R base model, the research employs a language reduction technique to create a lightweight bilingual large language model (LLM). Our approach for domain adaptation addresses the unique challenges faced in the Islamic domain, where substantial in-domain corpora exist only in Arabic while limited in other languages, including English. The work utilizes a multi-stage training process for retrieval models, incorporating large retrieval datasets, such as MS MARCO, and smaller, in-domain datasets to improve retrieval performance. Additionally, we have curated an in-domain retrieval dataset in English by employing data augmentation techniques and involving a reliable Islamic source. This approach enhances the domain-specific dataset for retrieval, leading to further performance gains. The findings suggest that combining domain adaptation and a multi-stage training method for the bilingual Islamic neural retrieval model enables it to outperform monolingual models on downstream retrieval tasks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pavlova-2025-multi">
<titleInfo>
<title>Multi-stage Training of Bilingual Islamic LLM for Neural Passage Retrieval</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Pavlova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the New Horizons in Computational Linguistics for Religious Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sane</namePart>
<namePart type="family">Yagi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Majdi</namePart>
<namePart type="family">Sawalha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bayan</namePart>
<namePart type="given">Abu</namePart>
<namePart type="family">Shawar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdallah</namePart>
<namePart type="given">T</namePart>
<namePart type="family">AlShdaifat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Norhan</namePart>
<namePart type="family">Abbas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name>
<namePart>Organizers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This study examines the use of Natural Language Processing (NLP) technology within the Islamic domain, focusing on developing an Islamic neural retrieval model. By leveraging the robust XLM-R base model, the research employs a language reduction technique to create a lightweight bilingual large language model (LLM). Our approach for domain adaptation addresses the unique challenges faced in the Islamic domain, where substantial in-domain corpora exist only in Arabic while limited in other languages, including English. The work utilizes a multi-stage training process for retrieval models, incorporating large retrieval datasets, such as MS MARCO, and smaller, in-domain datasets to improve retrieval performance. Additionally, we have curated an in-domain retrieval dataset in English by employing data augmentation techniques and involving a reliable Islamic source. This approach enhances the domain-specific dataset for retrieval, leading to further performance gains. The findings suggest that combining domain adaptation and a multi-stage training method for the bilingual Islamic neural retrieval model enables it to outperform monolingual models on downstream retrieval tasks.</abstract>
<identifier type="citekey">pavlova-2025-multi</identifier>
<location>
<url>https://aclanthology.org/2025.clrel-1.4/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>42</start>
<end>52</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-stage Training of Bilingual Islamic LLM for Neural Passage Retrieval
%A Pavlova, Vera
%Y Yagi, Sane
%Y Sawalha, Majdi
%Y Shawar, Bayan Abu
%Y AlShdaifat, Abdallah T.
%Y Abbas, Norhan
%E Organizers
%S Proceedings of the New Horizons in Computational Linguistics for Religious Texts
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F pavlova-2025-multi
%X This study examines the use of Natural Language Processing (NLP) technology within the Islamic domain, focusing on developing an Islamic neural retrieval model. By leveraging the robust XLM-R base model, the research employs a language reduction technique to create a lightweight bilingual large language model (LLM). Our approach for domain adaptation addresses the unique challenges faced in the Islamic domain, where substantial in-domain corpora exist only in Arabic while limited in other languages, including English. The work utilizes a multi-stage training process for retrieval models, incorporating large retrieval datasets, such as MS MARCO, and smaller, in-domain datasets to improve retrieval performance. Additionally, we have curated an in-domain retrieval dataset in English by employing data augmentation techniques and involving a reliable Islamic source. This approach enhances the domain-specific dataset for retrieval, leading to further performance gains. The findings suggest that combining domain adaptation and a multi-stage training method for the bilingual Islamic neural retrieval model enables it to outperform monolingual models on downstream retrieval tasks.
%U https://aclanthology.org/2025.clrel-1.4/
%P 42-52
Markdown (Informal)
[Multi-stage Training of Bilingual Islamic LLM for Neural Passage Retrieval](https://aclanthology.org/2025.clrel-1.4/) (Pavlova, CLRel 2025)
ACL