@inproceedings{nahin-etal-2025-titullms,
title = "{T}itu{LLM}s: A Family of {B}angla {LLM}s with Comprehensive Benchmarking",
author = "Nahin, Shahriar Kabir and
Nandi, Rabindra Nath and
Sarker, Sagor and
Muhtaseem, Quazi Sarwar and
Kowsher, Md and
Shill, Apu Chandraw and
Ibrahim, Md and
Menon, Mehadi Hasan and
Muntasir, Tareq Al and
Alam, Firoj",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1279/",
doi = "10.18653/v1/2025.findings-acl.1279",
pages = "24922--24940",
ISBN = "979-8-89176-256-5",
abstract = "In this paper, we present TituLLMs, the first large pretrained Bangla LLMs, available in 1b and 3b parameter sizes. Due to computational constraints during both training and inference, we focused on smaller models. To train TituLLMs, we collected a pretraining dataset of approximately {\ensuremath{\sim}} 37 billion tokens. We extended the Llama-3.2 tokenizer to incorporate language- and culture-specific knowledge, which also enables faster training and inference. There was a lack of benchmarking datasets to benchmark LLMs for Bangla. To address this gap, we developed five benchmarking datasets. We benchmarked various LLMs, including TituLLMs, and demonstrated that TituLLMs outperforms its initial multilingual versions. However, this is not always the case, highlighting the complexities of language adaptation. Our work lays the groundwork for adapting existing multilingual open models to other low-resource languages. To facilitate broader adoption and further research, we have made the TituLLMs models and benchmarking datasets publicly available."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nahin-etal-2025-titullms">
<titleInfo>
<title>TituLLMs: A Family of Bangla LLMs with Comprehensive Benchmarking</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shahriar</namePart>
<namePart type="given">Kabir</namePart>
<namePart type="family">Nahin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rabindra</namePart>
<namePart type="given">Nath</namePart>
<namePart type="family">Nandi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sagor</namePart>
<namePart type="family">Sarker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Quazi</namePart>
<namePart type="given">Sarwar</namePart>
<namePart type="family">Muhtaseem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="family">Kowsher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Apu</namePart>
<namePart type="given">Chandraw</namePart>
<namePart type="family">Shill</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="family">Ibrahim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehadi</namePart>
<namePart type="given">Hasan</namePart>
<namePart type="family">Menon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tareq</namePart>
<namePart type="given">Al</namePart>
<namePart type="family">Muntasir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Firoj</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>In this paper, we present TituLLMs, the first large pretrained Bangla LLMs, available in 1b and 3b parameter sizes. Due to computational constraints during both training and inference, we focused on smaller models. To train TituLLMs, we collected a pretraining dataset of approximately ~37 billion tokens. We extended the Llama-3.2 tokenizer to incorporate language- and culture-specific knowledge, which also enables faster training and inference. There was a lack of benchmarking datasets to benchmark LLMs for Bangla. To address this gap, we developed five benchmarking datasets. We benchmarked various LLMs, including TituLLMs, and demonstrated that TituLLMs outperforms its initial multilingual versions. However, this is not always the case, highlighting the complexities of language adaptation. Our work lays the groundwork for adapting existing multilingual open models to other low-resource languages. To facilitate broader adoption and further research, we have made the TituLLMs models and benchmarking datasets publicly available.</abstract>
<identifier type="citekey">nahin-etal-2025-titullms</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1279</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1279/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>24922</start>
<end>24940</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TituLLMs: A Family of Bangla LLMs with Comprehensive Benchmarking
%A Nahin, Shahriar Kabir
%A Nandi, Rabindra Nath
%A Sarker, Sagor
%A Muhtaseem, Quazi Sarwar
%A Kowsher, Md
%A Shill, Apu Chandraw
%A Ibrahim, Md
%A Menon, Mehadi Hasan
%A Muntasir, Tareq Al
%A Alam, Firoj
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F nahin-etal-2025-titullms
%X In this paper, we present TituLLMs, the first large pretrained Bangla LLMs, available in 1b and 3b parameter sizes. Due to computational constraints during both training and inference, we focused on smaller models. To train TituLLMs, we collected a pretraining dataset of approximately ~37 billion tokens. We extended the Llama-3.2 tokenizer to incorporate language- and culture-specific knowledge, which also enables faster training and inference. There was a lack of benchmarking datasets to benchmark LLMs for Bangla. To address this gap, we developed five benchmarking datasets. We benchmarked various LLMs, including TituLLMs, and demonstrated that TituLLMs outperforms its initial multilingual versions. However, this is not always the case, highlighting the complexities of language adaptation. Our work lays the groundwork for adapting existing multilingual open models to other low-resource languages. To facilitate broader adoption and further research, we have made the TituLLMs models and benchmarking datasets publicly available.
%R 10.18653/v1/2025.findings-acl.1279
%U https://aclanthology.org/2025.findings-acl.1279/
%U https://doi.org/10.18653/v1/2025.findings-acl.1279
%P 24922-24940
Markdown (Informal)
[TituLLMs: A Family of Bangla LLMs with Comprehensive Benchmarking](https://aclanthology.org/2025.findings-acl.1279/) (Nahin et al., Findings 2025)
ACL
- Shahriar Kabir Nahin, Rabindra Nath Nandi, Sagor Sarker, Quazi Sarwar Muhtaseem, Md Kowsher, Apu Chandraw Shill, Md Ibrahim, Mehadi Hasan Menon, Tareq Al Muntasir, and Firoj Alam. 2025. TituLLMs: A Family of Bangla LLMs with Comprehensive Benchmarking. In Findings of the Association for Computational Linguistics: ACL 2025, pages 24922–24940, Vienna, Austria. Association for Computational Linguistics.