% RANLP 2025 conference paper introducing the PersianSciQA dataset
% (ACL Anthology record 2025.ranlp-1.4). The dataset name is brace-protected
% as a whole word so sentence-casing styles keep its capitalisation.
@inproceedings{aghadavoud-jolfaei-etal-2025-persiansciqa,
    title     = {{PersianSciQA}: A New Dataset for Bridging the Language Gap in Scientific Question Answering},
    author    = {Aghadavoud Jolfaei, Safoura and
                 Mohebi, Azadeh and
                 Hemmat, Zahra},
    editor    = {Angelova, Galia and
                 Kunilovskaya, Maria and
                 Escribe, Marie and
                 Mitkov, Ruslan},
    booktitle = {Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era},
    month     = sep,
    year      = {2025},
    address   = {Varna, Bulgaria},
    publisher = {INCOMA Ltd., Shoumen, Bulgaria},
    url       = {https://aclanthology.org/2025.ranlp-1.4/},
    pages     = {32--37},
    abstract  = {The shortage of specialized datasets hinders the development of Natural Language Processing (NLP) for scientific texts in low-resource languages such as Persian. To address this, we introduce PersianSciQA, a large-scale resource of 39,809 question-answer snippet pairs, each containing a question and a scientific answer snippet from a scientific engineering abstract source from IranDoc{'}s `Ganj' repository, linked by an LLM-assigned relevance score (0--3) that measures how relevant the question is to the content of the accompanying answer snippet. The dataset was generated using a two-stage prompting methodology and refined through a rigorous cleaning pipeline, including text normalization and semantic deduplication. Human validation of 1,000 instances by two NLP researchers confirmed the dataset{'}s quality and a substantial LLM-human agreement (Cohen{'}s kappa coefficient {\ensuremath{\kappa}}=0.6642). To demonstrate its value, we establish baseline benchmarks and show that fine-tuning on PersianSciQA dramatically improves a state-of-the-art model, achieving a Spearman correlation of 0.895 on a blind test set. PersianSciQA provides a crucial new resource to facilitate research in information retrieval and question answering within the Persian scientific domain.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="aghadavoud-jolfaei-etal-2025-persiansciqa">
<titleInfo>
<title>PersianSciQA: A New Dataset for Bridging the Language Gap in Scientific Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Safoura</namePart>
<namePart type="family">Aghadavoud Jolfaei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Azadeh</namePart>
<namePart type="family">Mohebi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zahra</namePart>
<namePart type="family">Hemmat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The shortage of specialized datasets hinders the development of Natural Language Processing (NLP) for scientific texts in low-resource languages such as Persian. To address this, we introduce PersianSciQA, a large-scale resource of 39,809 question-answer snippet pairs, each containing a question and a scientific answer snippet from a scientific engineering abstract source from IranDoc’s ‘Ganj’ repository, linked by an LLM-assigned relevance score (0-3) that measures how relevant the question is to the content of the accompanying answer snippet. The dataset was generated using a two-stage prompting methodology and refined through a rigorous cleaning pipeline, including text normalization and semantic deduplication. Human validation of 1,000 instances by two NLP researchers confirmed the dataset’s quality and a substantial LLM-human agreement (Cohen’s kappa coefficient κ=0.6642). To demonstrate its value, we establish baseline benchmarks and show that fine-tuning on PersianSciQA dramatically improves a state-of-the-art model, achieving a Spearman correlation of 0.895 on a blind test set. PersianSciQA provides a crucial new resource to facilitate research in information retrieval and question answering within the Persian scientific domain.</abstract>
<identifier type="citekey">aghadavoud-jolfaei-etal-2025-persiansciqa</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.4/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>32</start>
<end>37</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PersianSciQA: A New Dataset for Bridging the Language Gap in Scientific Question Answering
%A Aghadavoud Jolfaei, Safoura
%A Mohebi, Azadeh
%A Hemmat, Zahra
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F aghadavoud-jolfaei-etal-2025-persiansciqa
%X The shortage of specialized datasets hinders the development of Natural Language Processing (NLP) for scientific texts in low-resource languages such as Persian. To address this, we introduce PersianSciQA, a large-scale resource of 39,809 question-answer snippet pairs, each containing a question and a scientific answer snippet from a scientific engineering abstract source from IranDoc’s ‘Ganj’ repository, linked by an LLM-assigned relevance score (0-3) that measures how relevant the question is to the content of the accompanying answer snippet. The dataset was generated using a two-stage prompting methodology and refined through a rigorous cleaning pipeline, including text normalization and semantic deduplication. Human validation of 1,000 instances by two NLP researchers confirmed the dataset’s quality and a substantial LLM-human agreement (Cohen’s kappa coefficient κ=0.6642). To demonstrate its value, we establish baseline benchmarks and show that fine-tuning on PersianSciQA dramatically improves a state-of-the-art model, achieving a Spearman correlation of 0.895 on a blind test set. PersianSciQA provides a crucial new resource to facilitate research in information retrieval and question answering within the Persian scientific domain.
%U https://aclanthology.org/2025.ranlp-1.4/
%P 32-37
Markdown (Informal)
[PersianSciQA: A New Dataset for Bridging the Language Gap in Scientific Question Answering](https://aclanthology.org/2025.ranlp-1.4/) (Aghadavoud Jolfaei et al., RANLP 2025)
ACL