@inproceedings{zain-etal-2025-single,
title = "Single layer tiny Co4 outpaces {GPT}-2 and {GPT}-{BERT}",
author = "Zain, Noor Ul and
Naseem, Mohsin Raza and
Adeel, Ahsan",
editor = "Charpentier, Lucas and
Choshen, Leshem and
Cotterell, Ryan and
Gul, Mustafa Omer and
Hu, Michael Y. and
Liu, Jing and
Jumelet, Jaap and
Linzen, Tal and
Mueller, Aaron and
Ross, Candace and
Shah, Raj Sanjay and
Warstadt, Alex and
Wilcox, Ethan Gotlieb and
Williams, Adina",
booktitle = "Proceedings of the First BabyLM Workshop",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.babylm-main.24/",
pages = "313--322",
ISBN = "TODO",
abstract = "We show that a tiny $Co^{4}$ machine (CITATION) with a single layer, two heads, and 8M parameters, operating at $O(N)$ computational cost (where \textit{N} is the number of input tokens), in just 2 epochs outpaces GPT-2 (124M, 12 layers, $O(N^2)$) and GPT-BERT (30M, 12 layers, $O(N^2)$), both trained for 10 epochs. $Co^{4}$ achieves orders-of-magnitude greater training efficiency on 10M tokens, demonstrating sample-efficient pretraining. On the BabyLM challenge evaluation pipeline, $Co^{4}$ performs comparably or better across complex benchmarks, showing strong zero-shot and fine-tuning performance on SuperGLUE tasks. Specifically, $Co^{4}$ outperforms GPT-2 in 5 out of 7 zero-shot metrics and 6 out of 7 fine-tuning tasks, and GPT-BERT in 4 out of 7 metrics in both cases. These results strongly suggest a need to rethink prevailing deep learning paradigms and associated scaling laws."
}