@inproceedings{mehak-etal-2026-enabling,
title = "Enabling Structured Reasoning in {S}indhi with Culturally Grounded Instruction Tuning",
author = "Mehak, Mehak and
Zeinalipour, Kamyar and
Soomro, Pireh and
Chesi, Cristiano and
Gori, Marco and
Maggini, Marco",
editor = "Hettiarachchi, Hansi and
Ranasinghe, Tharindu and
Plum, Alistair and
Rayson, Paul and
Mitkov, Ruslan and
Gaber, Mohamed and
Premasiri, Damith and
Tan, Fiona Anting and
Uyangodage, Lasitha",
booktitle = "Proceedings of the Second Workshop on Language Models for Low-Resource Languages ({L}o{R}es{LM} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.loreslm-1.22/",
pages = "239--258",
ISBN = "979-8-89176-377-7",
abstract = "While Large Language Models (LLMs) excel in high-resource contexts, reasoning capabilities in low-resource languages (LRLs) like Sindhi remain limited. To bridge this gap, we introduce Sindhi-Reasoning-Instruct, the first culturally grounded Sindhi instruction corpus. We fine-tuned six LLaMA and Mistral models (1B{--}24B) to evaluate if parameter-efficient tuning enables deductive, inductive, and causal reasoning. Results demonstrate that linguistically authentic data is the decisive factor. Fine-tuning effectively restored Sindhi{'}s Perso-Arabic orthography and SOV structure, with the Mistral-Small-24B model achieving a massive 141{\%} relative improvement in human quality ratings over its base version. Furthermore, structured reasoning capabilities were found to scale with model size; while smaller models achieved high fluency, Mistral-Small-24B achieved top performance across logical categories, reaching 83{\%} on inductive reasoning tasks. This study provides empirical evidence that expert-curated, native instruction data allows LRL models to move beyond simple translation toward robust, structured reasoning. The dataset and models are publicly available."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="mehak-etal-2026-enabling">
    <titleInfo>
      <title>Enabling Structured Reasoning in Sindhi with Culturally Grounded Instruction Tuning</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Mehak</namePart>
      <namePart type="family">Mehak</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kamyar</namePart>
      <namePart type="family">Zeinalipour</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pireh</namePart>
      <namePart type="family">Soomro</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Cristiano</namePart>
      <namePart type="family">Chesi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marco</namePart>
      <namePart type="family">Gori</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marco</namePart>
      <namePart type="family">Maggini</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Hansi</namePart>
        <namePart type="family">Hettiarachchi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tharindu</namePart>
        <namePart type="family">Ranasinghe</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alistair</namePart>
        <namePart type="family">Plum</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Paul</namePart>
        <namePart type="family">Rayson</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ruslan</namePart>
        <namePart type="family">Mitkov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohamed</namePart>
        <namePart type="family">Gaber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Damith</namePart>
        <namePart type="family">Premasiri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fiona</namePart>
        <namePart type="given">Anting</namePart>
        <namePart type="family">Tan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lasitha</namePart>
        <namePart type="family">Uyangodage</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-377-7</identifier>
    </relatedItem>
    <abstract>While Large Language Models (LLMs) excel in high-resource contexts, reasoning capabilities in low-resource languages (LRLs) like Sindhi remain limited. To bridge this gap, we introduce Sindhi-Reasoning-Instruct, the first culturally grounded Sindhi instruction corpus. We fine-tuned six LLaMA and Mistral models (1B–24B) to evaluate if parameter-efficient tuning enables deductive, inductive, and causal reasoning. Results demonstrate that linguistically authentic data is the decisive factor. Fine-tuning effectively restored Sindhi’s Perso-Arabic orthography and SOV structure, with the Mistral-Small-24B model achieving a massive 141% relative improvement in human quality ratings over its base version. Furthermore, structured reasoning capabilities were found to scale with model size; while smaller models achieved high fluency, Mistral-Small-24B achieved top performance across logical categories, reaching 83% on inductive reasoning tasks. This study provides empirical evidence that expert-curated, native instruction data allows LRL models to move beyond simple translation toward robust, structured reasoning. The dataset and models are publicly available.</abstract>
    <identifier type="citekey">mehak-etal-2026-enabling</identifier>
    <location>
      <url>https://aclanthology.org/2026.loreslm-1.22/</url>
    </location>
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>239</start>
        <end>258</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Enabling Structured Reasoning in Sindhi with Culturally Grounded Instruction Tuning
%A Mehak, Mehak
%A Zeinalipour, Kamyar
%A Soomro, Pireh
%A Chesi, Cristiano
%A Gori, Marco
%A Maggini, Marco
%Y Hettiarachchi, Hansi
%Y Ranasinghe, Tharindu
%Y Plum, Alistair
%Y Rayson, Paul
%Y Mitkov, Ruslan
%Y Gaber, Mohamed
%Y Premasiri, Damith
%Y Tan, Fiona Anting
%Y Uyangodage, Lasitha
%S Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-377-7
%F mehak-etal-2026-enabling
%X While Large Language Models (LLMs) excel in high-resource contexts, reasoning capabilities in low-resource languages (LRLs) like Sindhi remain limited. To bridge this gap, we introduce Sindhi-Reasoning-Instruct, the first culturally grounded Sindhi instruction corpus. We fine-tuned six LLaMA and Mistral models (1B–24B) to evaluate if parameter-efficient tuning enables deductive, inductive, and causal reasoning. Results demonstrate that linguistically authentic data is the decisive factor. Fine-tuning effectively restored Sindhi’s Perso-Arabic orthography and SOV structure, with the Mistral-Small-24B model achieving a massive 141% relative improvement in human quality ratings over its base version. Furthermore, structured reasoning capabilities were found to scale with model size; while smaller models achieved high fluency, Mistral-Small-24B achieved top performance across logical categories, reaching 83% on inductive reasoning tasks. This study provides empirical evidence that expert-curated, native instruction data allows LRL models to move beyond simple translation toward robust, structured reasoning. The dataset and models are publicly available.
%U https://aclanthology.org/2026.loreslm-1.22/
%P 239-258

Markdown (Informal)

[Enabling Structured Reasoning in Sindhi with Culturally Grounded Instruction Tuning](https://aclanthology.org/2026.loreslm-1.22/) (Mehak et al., LoResLM 2026)

ACL

Mehak Mehak, Kamyar Zeinalipour, Pireh Soomro, Cristiano Chesi, Marco Gori, and Marco Maggini. 2026. Enabling Structured Reasoning in Sindhi with Culturally Grounded Instruction Tuning. In Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026), pages 239–258, Rabat, Morocco. Association for Computational Linguistics.
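
For LaTeX users, a minimal sketch of citing this record via its citekey. The filename `anthology.bib` and the `plain` bibliography style are assumptions for illustration, not part of the record; substitute your own .bib file and style (e.g., the ACL template's style file) as needed.

```latex
% Minimal document citing the entry above with plain BibTeX.
% Assumes the BibTeX record is saved as anthology.bib (hypothetical filename).
\documentclass{article}
\begin{document}
Culturally grounded instruction tuning enables structured reasoning
in Sindhi \cite{mehak-etal-2026-enabling}.
\bibliographystyle{plain}  % assumption: swap in your venue's .bst file
\bibliography{anthology}
\end{document}
```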