@inproceedings{liu-etal-2025-seaexam,
title = "{S}ea{E}xam and {S}ea{B}ench: Benchmarking {LLM}s with Local Multilingual Questions in {S}outheast {A}sia",
author = "Liu, Chaoqun and
Zhang, Wenxuan and
Ying, Jiahao and
Aljunied, Mahani and
Luu, Anh Tuan and
Bing, Lidong",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.341/",
doi = "10.18653/v1/2025.findings-naacl.341",
pages = "6119--6136",
ISBN = "979-8-89176-195-7",
abstract = "This study introduces two novel benchmarks, SeaExam and SeaBench, designed to evaluate the capabilities of Large Language Models (LLMs) in Southeast Asian (SEA) application scenarios. Unlike existing multilingual datasets primarily derived from English translations, these benchmarks are constructed based on real-world scenarios from SEA regions. SeaExam draws from regional educational exams to form a comprehensive dataset that encompasses subjects such as local history and literature. In contrast, SeaBench is crafted around multi-turn, open-ended tasks that reflect daily interactions within SEA communities. Our evaluations demonstrate that SeaExam and SeaBench more effectively discern LLM performance on SEA language tasks compared to their translated benchmarks. This highlights the importance of using real-world queries to assess the multilingual capabilities of LLMs."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2025-seaexam">
<titleInfo>
<title>SeaExam and SeaBench: Benchmarking LLMs with Local Multilingual Questions in Southeast Asia</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chaoqun</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenxuan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiahao</namePart>
<namePart type="family">Ying</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mahani</namePart>
<namePart type="family">Aljunied</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anh</namePart>
<namePart type="given">Tuan</namePart>
<namePart type="family">Luu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lidong</namePart>
<namePart type="family">Bing</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>This study introduces two novel benchmarks, SeaExam and SeaBench, designed to evaluate the capabilities of Large Language Models (LLMs) in Southeast Asian (SEA) application scenarios. Unlike existing multilingual datasets primarily derived from English translations, these benchmarks are constructed based on real-world scenarios from SEA regions. SeaExam draws from regional educational exams to form a comprehensive dataset that encompasses subjects such as local history and literature. In contrast, SeaBench is crafted around multi-turn, open-ended tasks that reflect daily interactions within SEA communities. Our evaluations demonstrate that SeaExam and SeaBench more effectively discern LLM performance on SEA language tasks compared to their translated benchmarks. This highlights the importance of using real-world queries to assess the multilingual capabilities of LLMs.</abstract>
<identifier type="citekey">liu-etal-2025-seaexam</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.341</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.341/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>6119</start>
<end>6136</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SeaExam and SeaBench: Benchmarking LLMs with Local Multilingual Questions in Southeast Asia
%A Liu, Chaoqun
%A Zhang, Wenxuan
%A Ying, Jiahao
%A Aljunied, Mahani
%A Luu, Anh Tuan
%A Bing, Lidong
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F liu-etal-2025-seaexam
%X This study introduces two novel benchmarks, SeaExam and SeaBench, designed to evaluate the capabilities of Large Language Models (LLMs) in Southeast Asian (SEA) application scenarios. Unlike existing multilingual datasets primarily derived from English translations, these benchmarks are constructed based on real-world scenarios from SEA regions. SeaExam draws from regional educational exams to form a comprehensive dataset that encompasses subjects such as local history and literature. In contrast, SeaBench is crafted around multi-turn, open-ended tasks that reflect daily interactions within SEA communities. Our evaluations demonstrate that SeaExam and SeaBench more effectively discern LLM performance on SEA language tasks compared to their translated benchmarks. This highlights the importance of using real-world queries to assess the multilingual capabilities of LLMs.
%R 10.18653/v1/2025.findings-naacl.341
%U https://aclanthology.org/2025.findings-naacl.341/
%U https://doi.org/10.18653/v1/2025.findings-naacl.341
%P 6119-6136
Markdown (Informal)
[SeaExam and SeaBench: Benchmarking LLMs with Local Multilingual Questions in Southeast Asia](https://aclanthology.org/2025.findings-naacl.341/) (Liu et al., Findings 2025)
ACL