% Conference paper in the ACL 2024 long-papers volume (Anthology ID 2024.acl-long.796).
% Title: whole words are brace-protected (not single letters) so sentence-casing
% styles keep the proper nouns capitalised without disturbing kerning.
% Abstract: the second sentence supplies the antecedent of "this quality gap".
@inproceedings{cahyawijaya-etal-2024-cendol,
  title     = {{Cendol}: Open Instruction-tuned Generative Large Language Models for {Indonesian} Languages},
  author    = {Cahyawijaya, Samuel and
               Lovenia, Holy and
               Koto, Fajri and
               Putri, Rifki Afina and
               Dave, Emmanuel and
               Lee, Jhonson and
               Shadieq, Nuur and
               Cenggoro, Wawan and
               Akbar, Salsabil Maulana and
               Mahendra, Muhammad Ihza and
               Putri, Dea Annisayanti and
               Wilie, Bryan and
               Winata, Genta Indra and
               Aji, Alham Fikri and
               Purwarianti, Ayu and
               Fung, Pascale},
  editor    = {Ku, Lun-Wei and
               Martins, Andre and
               Srikumar, Vivek},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.acl-long.796/},
  doi       = {10.18653/v1/2024.acl-long.796},
  pages     = {14899--14914},
  abstract  = {Large language models (LLMs) show remarkable human-like capability in various domains and languages. However, a notable quality gap arises in low-resource languages, e.g., Indonesian indigenous languages, rendering them ineffective and inefficient in such linguistic contexts. To bridge this quality gap, we introduce Cendol, a collection of Indonesian LLMs encompassing both decoder-only and encoder-decoder architectures across a range of model sizes. We highlight Cendol{'}s effectiveness across a diverse array of tasks, attaining {\textasciitilde}20{\%} improvement, and demonstrate its capability to generalize to unseen tasks and indigenous languages of Indonesia. Furthermore, Cendol models showcase improved human favorability despite their limitations in capturing indigenous knowledge and cultural values in Indonesia. In addition, we discuss the shortcomings of parameter-efficient tunings, such as LoRA, for language adaptation. Alternatively, we propose the usage of vocabulary adaptation to enhance efficiency. Lastly, we evaluate the safety of Cendol and showcase that safety in pre-training in one language such as English is transferable to low-resource languages, such as Indonesian, even without RLHF and safety fine-tuning.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cahyawijaya-etal-2024-cendol">
<titleInfo>
<title>Cendol: Open Instruction-tuned Generative Large Language Models for Indonesian Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Cahyawijaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Holy</namePart>
<namePart type="family">Lovenia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fajri</namePart>
<namePart type="family">Koto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rifki</namePart>
<namePart type="given">Afina</namePart>
<namePart type="family">Putri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emmanuel</namePart>
<namePart type="family">Dave</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jhonson</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nuur</namePart>
<namePart type="family">Shadieq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wawan</namePart>
<namePart type="family">Cenggoro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salsabil</namePart>
<namePart type="given">Maulana</namePart>
<namePart type="family">Akbar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="given">Ihza</namePart>
<namePart type="family">Mahendra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dea</namePart>
<namePart type="given">Annisayanti</namePart>
<namePart type="family">Putri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bryan</namePart>
<namePart type="family">Wilie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Genta</namePart>
<namePart type="given">Indra</namePart>
<namePart type="family">Winata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alham</namePart>
<namePart type="given">Fikri</namePart>
<namePart type="family">Aji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayu</namePart>
<namePart type="family">Purwarianti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pascale</namePart>
<namePart type="family">Fung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models (LLMs) show remarkable human-like capability in various domains and languages. However, a notable quality gap arises in low-resource languages, e.g., Indonesian indigenous languages, rendering them ineffective and inefficient in such linguistic contexts. To bridge this quality gap, we introduce Cendol, a collection of Indonesian LLMs encompassing both decoder-only and encoder-decoder architectures across a range of model sizes. We highlight Cendol’s effectiveness across a diverse array of tasks, attaining ~20% improvement, and demonstrate its capability to generalize to unseen tasks and indigenous languages of Indonesia. Furthermore, Cendol models showcase improved human favorability despite their limitations in capturing indigenous knowledge and cultural values in Indonesia. In addition, we discuss the shortcomings of parameter-efficient tunings, such as LoRA, for language adaptation. Alternatively, we propose the usage of vocabulary adaptation to enhance efficiency. Lastly, we evaluate the safety of Cendol and showcase that safety in pre-training in one language such as English is transferable to low-resource languages, such as Indonesian, even without RLHF and safety fine-tuning.</abstract>
<identifier type="citekey">cahyawijaya-etal-2024-cendol</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.796</identifier>
<location>
<url>https://aclanthology.org/2024.acl-long.796/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>14899</start>
<end>14914</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Cendol: Open Instruction-tuned Generative Large Language Models for Indonesian Languages
%A Cahyawijaya, Samuel
%A Lovenia, Holy
%A Koto, Fajri
%A Putri, Rifki Afina
%A Dave, Emmanuel
%A Lee, Jhonson
%A Shadieq, Nuur
%A Cenggoro, Wawan
%A Akbar, Salsabil Maulana
%A Mahendra, Muhammad Ihza
%A Putri, Dea Annisayanti
%A Wilie, Bryan
%A Winata, Genta Indra
%A Aji, Alham Fikri
%A Purwarianti, Ayu
%A Fung, Pascale
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F cahyawijaya-etal-2024-cendol
%X Large language models (LLMs) show remarkable human-like capability in various domains and languages. However, a notable quality gap arises in low-resource languages, e.g., Indonesian indigenous languages, rendering them ineffective and inefficient in such linguistic contexts. To bridge this quality gap, we introduce Cendol, a collection of Indonesian LLMs encompassing both decoder-only and encoder-decoder architectures across a range of model sizes. We highlight Cendol’s effectiveness across a diverse array of tasks, attaining ~20% improvement, and demonstrate its capability to generalize to unseen tasks and indigenous languages of Indonesia. Furthermore, Cendol models showcase improved human favorability despite their limitations in capturing indigenous knowledge and cultural values in Indonesia. In addition, we discuss the shortcomings of parameter-efficient tunings, such as LoRA, for language adaptation. Alternatively, we propose the usage of vocabulary adaptation to enhance efficiency. Lastly, we evaluate the safety of Cendol and showcase that safety in pre-training in one language such as English is transferable to low-resource languages, such as Indonesian, even without RLHF and safety fine-tuning.
%R 10.18653/v1/2024.acl-long.796
%U https://aclanthology.org/2024.acl-long.796/
%U https://doi.org/10.18653/v1/2024.acl-long.796
%P 14899-14914
Markdown (Informal)
[Cendol: Open Instruction-tuned Generative Large Language Models for Indonesian Languages](https://aclanthology.org/2024.acl-long.796/) (Cahyawijaya et al., ACL 2024)
ACL
- Samuel Cahyawijaya, Holy Lovenia, Fajri Koto, Rifki Afina Putri, Emmanuel Dave, Jhonson Lee, Nuur Shadieq, Wawan Cenggoro, Salsabil Maulana Akbar, Muhammad Ihza Mahendra, Dea Annisayanti Putri, Bryan Wilie, Genta Indra Winata, Alham Fikri Aji, Ayu Purwarianti, and Pascale Fung. 2024. Cendol: Open Instruction-tuned Generative Large Language Models for Indonesian Languages. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 14899–14914, Bangkok, Thailand. Association for Computational Linguistics.