% Conference-paper record for ACL Anthology item 2026.sigturk-1.17.
% Field values are kept as exported; only delimiters, brace protection,
% field-name case, and layout are normalised.
% NOTE(review): address holds "Rabat, Morocco" -- presumably the workshop
% venue rather than the publisher's city; confirm before relocating it.
% NOTE(review): booktitle reads "Second Workshop Natural Language Processing"
% (no "on"); kept verbatim -- confirm against the proceedings front matter.
@inproceedings{bayar-etal-2026-building,
  author    = {Bayar, Alperen Enes and
               Ege, Mert and
               Yurtalan, G{\"o}khan and
               Karamanlioglu, Alper and
               Demirel, Berkan and
               Cinbis, Ramazan Gokberk},
  title     = {Building a {Turkish} Large Language Model via Continual Pre-Training and Parameter-Efficient Adaptation},
  editor    = {Oflazer, Kemal and
               K{\"o}ksal, Abdullatif and
               Varol, Onur},
  booktitle = {Proceedings of the Second Workshop Natural Language Processing for {Turkic} Languages ({SIGTURK} 2026)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  pages     = {209--219},
  isbn      = {979-8-89176-370-8},
  url       = {https://aclanthology.org/2026.sigturk-1.17/},
  abstract  = {Large Language Models (LLMs) achieve strong performance on many tasks, but they still struggle with morphologically rich, low-resource languages such as Turkish. This difficulty stems from Turkish being an agglutinative language and underrepresented in multilingual training data, which causes current models to often fail at capturing its morphology, flexible word order, and formal registers. In this paper, we introduce MODA (Model Adapted for Domain Applications), a Turkish-specialized LLM built via a modular pipeline that combines continual pre-training, parameter-efficient fine-tuning, and model merging. Starting from Qwen2.5-7B as the base model, we first perform large-scale continual pre-training on a Turkish web corpus to improve grammatical and morphological representations. We then apply parameter-efficient supervised fine-tuning on task-oriented instruction data, and finally merge specialized variants into a single unified model. We evaluate MODA on TurkishMMLU, the Turkish subset of EXAMS, and TRCLAIM-19, where it consistently outperforms both the base and instruction-tuned Qwen2.5-7B models. Our results support a training strategy that explicitly separates linguistic acquisition from task alignment when adapting LLMs to morphologically rich, underrepresented languages under realistic hardware constraints.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper identified by the citekey below. -->
  <mods ID="bayar-etal-2026-building">
    <titleInfo>
      <title>Building a Turkish Large Language Model via Continual Pre-Training and Parameter-Efficient Adaptation</title>
    </titleInfo>

    <!-- Authors, in the order given by the record. -->
    <name type="personal">
      <namePart type="given">Alperen</namePart>
      <namePart type="given">Enes</namePart>
      <namePart type="family">Bayar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mert</namePart>
      <namePart type="family">Ege</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gökhan</namePart>
      <namePart type="family">Yurtalan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alper</namePart>
      <namePart type="family">Karamanlioglu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Berkan</namePart>
      <namePart type="family">Demirel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ramazan</namePart>
      <namePart type="given">Gokberk</namePart>
      <namePart type="family">Cinbis</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, its editors, publisher, and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Second Workshop Natural Language Processing for Turkic Languages (SIGTURK 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Kemal</namePart>
        <namePart type="family">Oflazer</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Abdullatif</namePart>
        <namePart type="family">Köksal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Onur</namePart>
        <namePart type="family">Varol</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-370-8</identifier>
    </relatedItem>

    <abstract>Large Language Models (LLMs) achieve strong performance on many tasks, but they still struggle with morphologically rich, low-resource languages such as Turkish. This difficulty stems from Turkish being an agglutinative language and underrepresented in multilingual training data, which causes current models to often fail at capturing its morphology, flexible word order, and formal registers. In this paper, we introduce MODA (Model Adapted for Domain Applications), a Turkish-specialized LLM built via a modular pipeline that combines continual pre-training, parameter-efficient fine-tuning, and model merging. Starting from Qwen2.5-7B as the base model, we first perform large-scale continual pre-training on a Turkish web corpus to improve grammatical and morphological representations. We then apply parameter-efficient supervised fine-tuning on task-oriented instruction data, and finally merge specialized variants into a single unified model. We evaluate MODA on TurkishMMLU, the Turkish subset of EXAMS, and TRCLAIM-19, where it consistently outperforms both the base and instruction-tuned Qwen2.5-7B models. Our results support a training strategy that explicitly separates linguistic acquisition from task alignment when adapting LLMs to morphologically rich, underrepresented languages under realistic hardware constraints.</abstract>
    <identifier type="citekey">bayar-etal-2026-building</identifier>
    <location>
      <url>https://aclanthology.org/2026.sigturk-1.17/</url>
    </location>

    <!-- Page extent of the paper within the host volume. -->
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>209</start>
        <end>219</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Building a Turkish Large Language Model via Continual Pre-Training and Parameter-Efficient Adaptation
%A Bayar, Alperen Enes
%A Ege, Mert
%A Yurtalan, Gökhan
%A Karamanlioglu, Alper
%A Demirel, Berkan
%A Cinbis, Ramazan Gokberk
%Y Oflazer, Kemal
%Y Köksal, Abdullatif
%Y Varol, Onur
%S Proceedings of the Second Workshop Natural Language Processing for Turkic Languages (SIGTURK 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-370-8
%F bayar-etal-2026-building
%X Large Language Models (LLMs) achieve strong performance on many tasks, but they still struggle with morphologically rich, low-resource languages such as Turkish. This difficulty stems from Turkish being an agglutinative language and underrepresented in multilingual training data, which causes current models to often fail at capturing its morphology, flexible word order, and formal registers. In this paper, we introduce MODA (Model Adapted for Domain Applications), a Turkish-specialized LLM built via a modular pipeline that combines continual pre-training, parameter-efficient fine-tuning, and model merging. Starting from Qwen2.5-7B as the base model, we first perform large-scale continual pre-training on a Turkish web corpus to improve grammatical and morphological representations. We then apply parameter-efficient supervised fine-tuning on task-oriented instruction data, and finally merge specialized variants into a single unified model. We evaluate MODA on TurkishMMLU, the Turkish subset of EXAMS, and TRCLAIM-19, where it consistently outperforms both the base and instruction-tuned Qwen2.5-7B models. Our results support a training strategy that explicitly separates linguistic acquisition from task alignment when adapting LLMs to morphologically rich, underrepresented languages under realistic hardware constraints.
%U https://aclanthology.org/2026.sigturk-1.17/
%P 209-219
Markdown (Informal)
[Building a Turkish Large Language Model via Continual Pre-Training and Parameter-Efficient Adaptation](https://aclanthology.org/2026.sigturk-1.17/) (Bayar et al., SIGTURK 2026)
ACL