@inproceedings{dhakal-baral-2025-abstractive,
title = "Abstractive Summarization of Low resourced {N}epali language using Multilingual Transformers",
author = "Dhakal, Prakash and
Baral, Daya Sagar",
editor = "Sarveswaran, Kengatharaiyer and
Vaidya, Ashwini and
Krishna Bal, Bal and
Shams, Sana and
Thapa, Surendrabikram",
booktitle = "Proceedings of the First Workshop on Challenges in Processing South Asian Languages (CHiPSAL 2025)",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2025.chipsal-1.12/",
pages = "124--133",
abstract = "Nepali, one of the prominent languages of South Asia, remains underrepresented in natural language processing (NLP) research, particularly in the domain of abstractive summarization. While significant progress has been made in extractive summarization, the complexity of generating coherent, human-like summaries from low-resource languages like Nepali is still largely unexplored. This paper introduces the first comprehensive study on applying multilingual transformer-based models, specifically mBART and mT5, to the task of generating headlines for Nepali news articles through abstractive summarization. Given the absence of large-scale datasets for this task, a new Nepali news headline summarization corpus was created by scraping data from multiple online news portals. The models were fine-tuned with this novel dataset using Low-Rank Adaptation (LoRA) and quantization techniques, allowing for more computationally efficient training while preserving performance. The models' effectiveness was evaluated using ROUGE scores and a human evaluation approach that focused on relevance, fluency, conciseness, informativeness, factual accuracy, and coverage. The findings demonstrate that a 4-bit quantized mBART model achieves superior performance, offering significant potential for improving digital content summarization for Nepali. This study highlights key challenges in processing Nepali, particularly its orthographic and resource limitations, while providing a path forward for advancing NLP tools for South Asian languages."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dhakal-baral-2025-abstractive">
<titleInfo>
<title>Abstractive Summarization of Low resourced Nepali language using Multilingual Transformers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Prakash</namePart>
<namePart type="family">Dhakal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daya</namePart>
<namePart type="given">Sagar</namePart>
<namePart type="family">Baral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Challenges in Processing South Asian Languages (CHiPSAL 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kengatharaiyer</namePart>
<namePart type="family">Sarveswaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashwini</namePart>
<namePart type="family">Vaidya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bal</namePart>
<namePart type="family">Krishna Bal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sana</namePart>
<namePart type="family">Shams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Surendrabikram</namePart>
<namePart type="family">Thapa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Nepali, one of the prominent languages of South Asia, remains underrepresented in natural language processing (NLP) research, particularly in the domain of abstractive summarization. While significant progress has been made in extractive summarization, the complexity of generating coherent, human-like summaries from low-resource languages like Nepali is still largely unexplored. This paper introduces the first comprehensive study on applying multilingual transformer-based models, specifically mBART and mT5, to the task of generating headlines for Nepali news articles through abstractive summarization. Given the absence of large-scale datasets for this task, a new Nepali news headline summarization corpus was created by scraping data from multiple online news portals. The models were fine-tuned with this novel dataset using Low-Rank Adaptation (LoRA) and quantization techniques, allowing for more computationally efficient training while preserving performance. The models’ effectiveness was evaluated using ROUGE scores and a human evaluation approach that focused on relevance, fluency, conciseness, informativeness, factual accuracy, and coverage. The findings demonstrate that a 4-bit quantized mBART model achieves superior performance, offering significant potential for improving digital content summarization for Nepali. This study highlights key challenges in processing Nepali, particularly its orthographic and resource limitations, while providing a path forward for advancing NLP tools for South Asian languages.</abstract>
<identifier type="citekey">dhakal-baral-2025-abstractive</identifier>
<location>
<url>https://aclanthology.org/2025.chipsal-1.12/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>124</start>
<end>133</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Abstractive Summarization of Low resourced Nepali language using Multilingual Transformers
%A Dhakal, Prakash
%A Baral, Daya Sagar
%Y Sarveswaran, Kengatharaiyer
%Y Vaidya, Ashwini
%Y Krishna Bal, Bal
%Y Shams, Sana
%Y Thapa, Surendrabikram
%S Proceedings of the First Workshop on Challenges in Processing South Asian Languages (CHiPSAL 2025)
%D 2025
%8 January
%I International Committee on Computational Linguistics
%C Abu Dhabi, UAE
%F dhakal-baral-2025-abstractive
%X Nepali, one of the prominent languages of South Asia, remains underrepresented in natural language processing (NLP) research, particularly in the domain of abstractive summarization. While significant progress has been made in extractive summarization, the complexity of generating coherent, human-like summaries from low-resource languages like Nepali is still largely unexplored. This paper introduces the first comprehensive study on applying multilingual transformer-based models, specifically mBART and mT5, to the task of generating headlines for Nepali news articles through abstractive summarization. Given the absence of large-scale datasets for this task, a new Nepali news headline summarization corpus was created by scraping data from multiple online news portals. The models were fine-tuned with this novel dataset using Low-Rank Adaptation (LoRA) and quantization techniques, allowing for more computationally efficient training while preserving performance. The models’ effectiveness was evaluated using ROUGE scores and a human evaluation approach that focused on relevance, fluency, conciseness, informativeness, factual accuracy, and coverage. The findings demonstrate that a 4-bit quantized mBART model achieves superior performance, offering significant potential for improving digital content summarization for Nepali. This study highlights key challenges in processing Nepali, particularly its orthographic and resource limitations, while providing a path forward for advancing NLP tools for South Asian languages.
%U https://aclanthology.org/2025.chipsal-1.12/
%P 124-133
Markdown (Informal)
[Abstractive Summarization of Low resourced Nepali language using Multilingual Transformers](https://aclanthology.org/2025.chipsal-1.12/) (Dhakal & Baral, CHiPSAL 2025)
ACL