% NusaBERT workshop paper (ACL Anthology record 2025.sealp-1.2).
% Model names in the title are brace-protected as whole words so that
% sentence-casing bibliography styles keep their capitalisation.
@inproceedings{wongso-etal-2025-nusabert,
    title = "{NusaBERT}: Teaching {IndoBERT} to be Multilingual and Multicultural",
    author = "Wongso, Wilson and
      Setiawan, David Samuel and
      Limcorn, Steven and
      Joyoadikusumo, Ananto",
    editor = "Wijaya, Derry and
      Aji, Alham Fikri and
      Vania, Clara and
      Winata, Genta Indra and
      Purwarianti, Ayu",
    booktitle = "Proceedings of the Second Workshop in South East Asian Language Processing",
    month = jan,
    year = "2025",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.sealp-1.2/",
    pages = "10--26",
    abstract = "We present NusaBERT, a multilingual model built on IndoBERT and tailored for Indonesia's diverse languages. By expanding vocabulary and pre-training on a regional corpus, NusaBERT achieves state-of-the-art performance on Indonesian NLU benchmarks, enhancing IndoBERT's multilingual capability. This study also addresses NusaBERT's limitations and encourages further research on Indonesia's underrepresented languages."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wongso-etal-2025-nusabert">
<titleInfo>
<title>NusaBERT: Teaching IndoBERT to be Multilingual and Multicultural</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wilson</namePart>
<namePart type="family">Wongso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="given">Samuel</namePart>
<namePart type="family">Setiawan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Limcorn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ananto</namePart>
<namePart type="family">Joyoadikusumo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop in South East Asian Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Derry</namePart>
<namePart type="family">Wijaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alham</namePart>
<namePart type="given">Fikri</namePart>
<namePart type="family">Aji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clara</namePart>
<namePart type="family">Vania</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Genta</namePart>
<namePart type="given">Indra</namePart>
<namePart type="family">Winata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayu</namePart>
<namePart type="family">Purwarianti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present NusaBERT, a multilingual model built on IndoBERT and tailored for Indonesia's diverse languages. By expanding vocabulary and pre-training on a regional corpus, NusaBERT achieves state-of-the-art performance on Indonesian NLU benchmarks, enhancing IndoBERT's multilingual capability. This study also addresses NusaBERT's limitations and encourages further research on Indonesia's underrepresented languages.</abstract>
<identifier type="citekey">wongso-etal-2025-nusabert</identifier>
<location>
<url>https://aclanthology.org/2025.sealp-1.2/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>10</start>
<end>26</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T NusaBERT: Teaching IndoBERT to be Multilingual and Multicultural
%A Wongso, Wilson
%A Setiawan, David Samuel
%A Limcorn, Steven
%A Joyoadikusumo, Ananto
%Y Wijaya, Derry
%Y Aji, Alham Fikri
%Y Vania, Clara
%Y Winata, Genta Indra
%Y Purwarianti, Ayu
%S Proceedings of the Second Workshop in South East Asian Language Processing
%D 2025
%8 January
%I Association for Computational Linguistics
%C Online
%F wongso-etal-2025-nusabert
%X We present NusaBERT, a multilingual model built on IndoBERT and tailored for Indonesia's diverse languages. By expanding vocabulary and pre-training on a regional corpus, NusaBERT achieves state-of-the-art performance on Indonesian NLU benchmarks, enhancing IndoBERT's multilingual capability. This study also addresses NusaBERT's limitations and encourages further research on Indonesia's underrepresented languages.
%U https://aclanthology.org/2025.sealp-1.2/
%P 10-26
Markdown (Informal)
[NusaBERT: Teaching IndoBERT to be Multilingual and Multicultural](https://aclanthology.org/2025.sealp-1.2/) (Wongso et al., SEALP 2025)
ACL