@inproceedings{lin-etal-2024-indocl,
title = "{I}ndo{CL}: Benchmarking {I}ndonesian Language Development Assessment",
author = "Lin, Nankai and
Wu, Hongyan and
Zheng, Weixiong and
Liao, Xingming and
Jiang, Shengyi and
Yang, Aimin and
Xiao, Lixian",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.280",
pages = "4873--4885",
abstract = "Recently, the field of language acquisition (LA) has significantly benefited from natural language processing technologies. A crucial task in LA involves tracking the evolution of language learners{'} competence, namely language development assessment (LDA). However, the majority of LDA research focuses on high-resource languages, with limited attention directed toward low-resource languages. Moreover, existing methodologies primarily depend on linguistic rules and language characteristics, with a limited exploration of exploiting pre-trained language models (PLMs) for LDA. In this paper, we construct the IndoCL corpus (Indonesian Corpus of L2 Learners), which comprises compositions written by undergraduate students majoring in Indonesian language. Moreover, we propose a model for LDA tasks, which automatically extracts language-independent features, relieving laborious computation and reliance on specific language. The proposed model uses sequential information attention and similarity representation learning to capture the differences and common information from the first-written and second-written essays, respectively. It has demonstrated remarkable performance on both our self-constructed corpus and publicly available corpora. Our work could serve as a novel benchmark for Indonesian LDA tasks. We also explore the feasibility of using existing large-scale language models (LLMs) for LDA tasks. The results show significant potential for improving LLM performance in LDA tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lin-etal-2024-indocl">
<titleInfo>
<title>IndoCL: Benchmarking Indonesian Language Development Assessment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nankai</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongyan</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weixiong</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xingming</namePart>
<namePart type="family">Liao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shengyi</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aimin</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lixian</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recently, the field of language acquisition (LA) has significantly benefited from natural language processing technologies. A crucial task in LA involves tracking the evolution of language learners’ competence, namely language development assessment (LDA). However, the majority of LDA research focuses on high-resource languages, with limited attention directed toward low-resource languages. Moreover, existing methodologies primarily depend on linguistic rules and language characteristics, with a limited exploration of exploiting pre-trained language models (PLMs) for LDA. In this paper, we construct the IndoCL corpus (Indonesian Corpus of L2 Learners), which comprises compositions written by undergraduate students majoring in Indonesian language. Moreover, we propose a model for LDA tasks, which automatically extracts language-independent features, relieving laborious computation and reliance on specific language. The proposed model uses sequential information attention and similarity representation learning to capture the differences and common information from the first-written and second-written essays, respectively. It has demonstrated remarkable performance on both our self-constructed corpus and publicly available corpora. Our work could serve as a novel benchmark for Indonesian LDA tasks. We also explore the feasibility of using existing large-scale language models (LLMs) for LDA tasks. The results show significant potential for improving LLM performance in LDA tasks.</abstract>
<identifier type="citekey">lin-etal-2024-indocl</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.280</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>4873</start>
<end>4885</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T IndoCL: Benchmarking Indonesian Language Development Assessment
%A Lin, Nankai
%A Wu, Hongyan
%A Zheng, Weixiong
%A Liao, Xingming
%A Jiang, Shengyi
%A Yang, Aimin
%A Xiao, Lixian
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F lin-etal-2024-indocl
%X Recently, the field of language acquisition (LA) has significantly benefited from natural language processing technologies. A crucial task in LA involves tracking the evolution of language learners’ competence, namely language development assessment (LDA). However, the majority of LDA research focuses on high-resource languages, with limited attention directed toward low-resource languages. Moreover, existing methodologies primarily depend on linguistic rules and language characteristics, with a limited exploration of exploiting pre-trained language models (PLMs) for LDA. In this paper, we construct the IndoCL corpus (Indonesian Corpus of L2 Learners), which comprises compositions written by undergraduate students majoring in Indonesian language. Moreover, we propose a model for LDA tasks, which automatically extracts language-independent features, relieving laborious computation and reliance on specific language. The proposed model uses sequential information attention and similarity representation learning to capture the differences and common information from the first-written and second-written essays, respectively. It has demonstrated remarkable performance on both our self-constructed corpus and publicly available corpora. Our work could serve as a novel benchmark for Indonesian LDA tasks. We also explore the feasibility of using existing large-scale language models (LLMs) for LDA tasks. The results show significant potential for improving LLM performance in LDA tasks.
%U https://aclanthology.org/2024.findings-emnlp.280
%P 4873-4885
Markdown (Informal)
[IndoCL: Benchmarking Indonesian Language Development Assessment](https://aclanthology.org/2024.findings-emnlp.280) (Lin et al., Findings 2024)
ACL
- Nankai Lin, Hongyan Wu, Weixiong Zheng, Xingming Liao, Shengyi Jiang, Aimin Yang, and Lixian Xiao. 2024. IndoCL: Benchmarking Indonesian Language Development Assessment. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 4873–4885, Miami, Florida, USA. Association for Computational Linguistics.