@inproceedings{bui-etal-2025-vmlu,
title = "{VMLU} Benchmarks: A comprehensive benchmark toolkit for {V}ietnamese {LLM}s",
author = "Bui, Cuc Thi and
Son, Nguyen Truong and
Trang, Truong Van and
Phung, Lam Viet and
Huy, Pham Nhut and
Le, Hoang Anh and
Van, Quoc Huu and
Do, Phong Nguyen-Thuan and
Truc, Van Le Tran and
Chau, Duc Thanh and
Nguyen, Le-Minh",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.563/",
doi = "10.18653/v1/2025.acl-long.563",
pages = "11495--11515",
ISBN = "979-8-89176-251-0",
abstract = "The evolution of Large Language Models (LLMs) has underscored the necessity for benchmarks designed for various languages and cultural contexts. To address this need for Vietnamese, we present the first Vietnamese Multitask Language Understanding (VMLU) Benchmarks. The VMLU benchmarks consist of four datasets that assess different capabilities of LLMs, including general knowledge, reading comprehension, reasoning, and conversational skills. This paper also provides an insightful overview of the current state of some dominant LLMs, such as Llama-3, Qwen2.5, and GPT-4, highlighting their performances and limitations when measured against these benchmarks. Furthermore, we provide insights into how prompt design can influence VMLU{'}s evaluation outcomes, as well as suggest that open-source LLMs can serve as effective, cost-efficient evaluators within the Vietnamese context. By offering a comprehensive and accessible benchmarking framework, the VMLU Benchmarks aim to foster the development and fine-tuning of Vietnamese LLMs, thereby establishing a foundation for their practical applications in language-specific domains."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bui-etal-2025-vmlu">
<titleInfo>
<title>VMLU Benchmarks: A comprehensive benchmark toolkit for Vietnamese LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cuc</namePart>
<namePart type="given">Thi</namePart>
<namePart type="family">Bui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nguyen</namePart>
<namePart type="given">Truong</namePart>
<namePart type="family">Son</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Truong</namePart>
<namePart type="given">Van</namePart>
<namePart type="family">Trang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lam</namePart>
<namePart type="given">Viet</namePart>
<namePart type="family">Phung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pham</namePart>
<namePart type="given">Nhut</namePart>
<namePart type="family">Huy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hoang</namePart>
<namePart type="given">Anh</namePart>
<namePart type="family">Le</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Quoc</namePart>
<namePart type="given">Huu</namePart>
<namePart type="family">Van</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Phong</namePart>
<namePart type="given">Nguyen-Thuan</namePart>
<namePart type="family">Do</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Van</namePart>
<namePart type="given">Le</namePart>
<namePart type="given">Tran</namePart>
<namePart type="family">Truc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Duc</namePart>
<namePart type="given">Thanh</namePart>
<namePart type="family">Chau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Le-Minh</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>The evolution of Large Language Models (LLMs) has underscored the necessity for benchmarks designed for various languages and cultural contexts. To address this need for Vietnamese, we present the first Vietnamese Multitask Language Understanding (VMLU) Benchmarks. The VMLU benchmarks consist of four datasets that assess different capabilities of LLMs, including general knowledge, reading comprehension, reasoning, and conversational skills. This paper also provides an insightful overview of the current state of some dominant LLMs, such as Llama-3, Qwen2.5, and GPT-4, highlighting their performances and limitations when measured against these benchmarks. Furthermore, we provide insights into how prompt design can influence VMLU’s evaluation outcomes, as well as suggest that open-source LLMs can serve as effective, cost-efficient evaluators within the Vietnamese context. By offering a comprehensive and accessible benchmarking framework, the VMLU Benchmarks aim to foster the development and fine-tuning of Vietnamese LLMs, thereby establishing a foundation for their practical applications in language-specific domains.</abstract>
<identifier type="citekey">bui-etal-2025-vmlu</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.563</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.563/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>11495</start>
<end>11515</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T VMLU Benchmarks: A comprehensive benchmark toolkit for Vietnamese LLMs
%A Bui, Cuc Thi
%A Son, Nguyen Truong
%A Trang, Truong Van
%A Phung, Lam Viet
%A Huy, Pham Nhut
%A Le, Hoang Anh
%A Van, Quoc Huu
%A Do, Phong Nguyen-Thuan
%A Truc, Van Le Tran
%A Chau, Duc Thanh
%A Nguyen, Le-Minh
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F bui-etal-2025-vmlu
%X The evolution of Large Language Models (LLMs) has underscored the necessity for benchmarks designed for various languages and cultural contexts. To address this need for Vietnamese, we present the first Vietnamese Multitask Language Understanding (VMLU) Benchmarks. The VMLU benchmarks consist of four datasets that assess different capabilities of LLMs, including general knowledge, reading comprehension, reasoning, and conversational skills. This paper also provides an insightful overview of the current state of some dominant LLMs, such as Llama-3, Qwen2.5, and GPT-4, highlighting their performances and limitations when measured against these benchmarks. Furthermore, we provide insights into how prompt design can influence VMLU’s evaluation outcomes, as well as suggest that open-source LLMs can serve as effective, cost-efficient evaluators within the Vietnamese context. By offering a comprehensive and accessible benchmarking framework, the VMLU Benchmarks aim to foster the development and fine-tuning of Vietnamese LLMs, thereby establishing a foundation for their practical applications in language-specific domains.
%R 10.18653/v1/2025.acl-long.563
%U https://aclanthology.org/2025.acl-long.563/
%U https://doi.org/10.18653/v1/2025.acl-long.563
%P 11495-11515
Markdown (Informal)
[VMLU Benchmarks: A comprehensive benchmark toolkit for Vietnamese LLMs](https://aclanthology.org/2025.acl-long.563/) (Bui et al., ACL 2025)
ACL
- Cuc Thi Bui, Nguyen Truong Son, Truong Van Trang, Lam Viet Phung, Pham Nhut Huy, Hoang Anh Le, Quoc Huu Van, Phong Nguyen-Thuan Do, Van Le Tran Truc, Duc Thanh Chau, and Le-Minh Nguyen. 2025. VMLU Benchmarks: A comprehensive benchmark toolkit for Vietnamese LLMs. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 11495–11515, Vienna, Austria. Association for Computational Linguistics.