% ACL Anthology record 2025.emnlp-main.1198: the SLlama paper in the EMNLP 2025 proceedings.
% The whole word "SLlama" is brace-protected in the title so styles that apply sentence casing keep its capitals.
@inproceedings{omolaoye-etal-2025-sllama,
    title     = {{SLlama}: Parameter-Efficient Language Model Architecture for Enhanced Linguistic Competence Under Strict Data Constraints},
    author    = {Omolaoye, Victor Adelakun and
                 Owoyele, Babajide Alamu and
                 de Melo, Gerard},
    editor    = {Christodoulopoulos, Christos and
                 Chakraborty, Tanmoy and
                 Rose, Carolyn and
                 Peng, Violet},
    booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
    month     = nov,
    year      = {2025},
    address   = {Suzhou, China},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.emnlp-main.1198/},
    doi       = {10.18653/v1/2025.emnlp-main.1198},
    pages     = {23480--23495},
    isbn      = {979-8-89176-332-6},
    abstract  = {Scaling data and model size has driven recent advances in language modeling, but this strategy falters under scenarios with strict data constraints, as in the BabyLM Challenge. However, insights from Chinchilla highlight that smaller models trained on more data outperform larger counterparts trained inadequately, emphasizing the need for compact architectures. Furthermore, while embedding weight tying is a common parameter-saving technique, we find it significantly diminishes linguistic competence in compact models. In response, we explore alternative architectural strategies that preserve the parameter efficiency of tied models without sacrificing the representational benefits of untied embeddings. Consequently, we introduce SLlama, a Llama3 architecture variant which incorporates targeted modifications{---}Repeated Reduced Hidden Size and Projection (RRHP), Permutated Weight Attention (PWA), Shared Projection Multi-Layer Perceptron (SPMLP), and Layer Weight Sharing{---}to compress Transformer components. Without relying on distillation, SLlama achieves a 31.72{\%} improvement in linguistic knowledge acquisition over the BabyLlama baseline, with a comparable GLUE score and significantly lower parameter count. These results demonstrate that well-designed, compact models can rival larger ones under strict data constraints.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for the SLlama paper; carries the same citekey, DOI, URL and page range as the BibTeX entry in this file. -->
  <mods ID="omolaoye-etal-2025-sllama">
    <titleInfo>
      <title>SLlama: Parameter-Efficient Language Model Architecture for Enhanced Linguistic Competence Under Strict Data Constraints</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Victor</namePart>
      <namePart type="given">Adelakun</namePart>
      <namePart type="family">Omolaoye</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Babajide</namePart>
      <namePart type="given">Alamu</namePart>
      <namePart type="family">Owoyele</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gerard</namePart>
      <namePart type="family">de Melo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-332-6</identifier>
    </relatedItem>
    <abstract>Scaling data and model size has driven recent advances in language modeling, but this strategy falters under scenarios with strict data constraints, as in the BabyLM Challenge. However, insights from Chinchilla highlight that smaller models trained on more data outperform larger counterparts trained inadequately, emphasizing the need for compact architectures. Furthermore, while embedding weight tying is a common parameter-saving technique, we find it significantly diminishes linguistic competence in compact models. In response, we explore alternative architectural strategies that preserve the parameter efficiency of tied models without sacrificing the representational benefits of untied embeddings. Consequently, we introduce SLlama, a Llama3 architecture variant which incorporates targeted modifications—Repeated Reduced Hidden Size and Projection (RRHP), Permutated Weight Attention (PWA), Shared Projection Multi-Layer Perceptron (SPMLP), and Layer Weight Sharing—to compress Transformer components. Without relying on distillation, SLlama achieves a 31.72% improvement in linguistic knowledge acquisition over the BabyLlama baseline, with a comparable GLUE score and significantly lower parameter count. These results demonstrate that well-designed, compact models can rival larger ones under strict data constraints.</abstract>
    <identifier type="citekey">omolaoye-etal-2025-sllama</identifier>
    <identifier type="doi">10.18653/v1/2025.emnlp-main.1198</identifier>
    <location>
      <url>https://aclanthology.org/2025.emnlp-main.1198/</url>
    </location>
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>23480</start>
        <end>23495</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T SLlama: Parameter-Efficient Language Model Architecture for Enhanced Linguistic Competence Under Strict Data Constraints
%A Omolaoye, Victor Adelakun
%A Owoyele, Babajide Alamu
%A de Melo, Gerard
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F omolaoye-etal-2025-sllama
%X Scaling data and model size has driven recent advances in language modeling, but this strategy falters under scenarios with strict data constraints, as in the BabyLM Challenge. However, insights from Chinchilla highlight that smaller models trained on more data outperform larger counterparts trained inadequately, emphasizing the need for compact architectures. Furthermore, while embedding weight tying is a common parameter-saving technique, we find it significantly diminishes linguistic competence in compact models. In response, we explore alternative architectural strategies that preserve the parameter efficiency of tied models without sacrificing the representational benefits of untied embeddings. Consequently, we introduce SLlama, a Llama3 architecture variant which incorporates targeted modifications—Repeated Reduced Hidden Size and Projection (RRHP), Permutated Weight Attention (PWA), Shared Projection Multi-Layer Perceptron (SPMLP), and Layer Weight Sharing—to compress Transformer components. Without relying on distillation, SLlama achieves a 31.72% improvement in linguistic knowledge acquisition over the BabyLlama baseline, with a comparable GLUE score and significantly lower parameter count. These results demonstrate that well-designed, compact models can rival larger ones under strict data constraints.
%R 10.18653/v1/2025.emnlp-main.1198
%U https://aclanthology.org/2025.emnlp-main.1198/
%U https://doi.org/10.18653/v1/2025.emnlp-main.1198
%P 23480-23495
Markdown (Informal)
[SLlama: Parameter-Efficient Language Model Architecture for Enhanced Linguistic Competence Under Strict Data Constraints](https://aclanthology.org/2025.emnlp-main.1198/) (Omolaoye et al., EMNLP 2025)
ACL