% Conference paper in the EACL 2024 Student Research Workshop proceedings; see the url field for the source record.
@inproceedings{miura-etal-2024-japanese,
    title = "{Japanese}-{English} Sentence Translation Exercises Dataset for Automatic Grading",
    author = "Miura, Naoki and
      Funayama, Hiroaki and
      Kikuchi, Seiya and
      Matsubayashi, Yuichiroh and
      Iwase, Yuya and
      Inui, Kentaro",
    editor = "Falk, Neele and
      Papi, Sara and
      Zhang, Mike",
    booktitle = "Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop",
    month = mar,
    year = "2024",
    address = "St. Julian{'}s, Malta",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.eacl-srw.21",
    pages = "266--278",
    abstract = "This paper proposes the task of automatic assessment of Sentence Translation Exercises (STEs), that have been used in the early stage of L2 language learning. We formalize the task as grading student responses for each rubric criterion pre-specified by the educators. We then create a dataset for STE between Japanese and English including 21 questions, along with a total of 3,498 student responses (167 on average). The answer responses were collected from students and crowd workers. Using this dataset, we demonstrate the performance of baselines including a finetuned BERT model and GPT-3.5 with few-shot learning. Experimental results showed that the baseline model with fine-tuned BERT was able to classify correct responses with approximately 90{\%} in $F_1$, but only less than 80{\%} for incorrect responses. Furthermore, GPT-3.5 with few-shot learning shows a poorer result than the BERT model, indicating that our newly proposed task presents a challenging issue, even for the state-of-the-art large language model.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="miura-etal-2024-japanese">
<titleInfo>
<title>Japanese-English Sentence Translation Exercises Dataset for Automatic Grading</title>
</titleInfo>
<name type="personal">
<namePart type="given">Naoki</namePart>
<namePart type="family">Miura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiroaki</namePart>
<namePart type="family">Funayama</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seiya</namePart>
<namePart type="family">Kikuchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuichiroh</namePart>
<namePart type="family">Matsubayashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuya</namePart>
<namePart type="family">Iwase</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Neele</namePart>
<namePart type="family">Falk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Papi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julian’s, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper proposes the task of automatic assessment of Sentence Translation Exercises (STEs), that have been used in the early stage of L2 language learning. We formalize the task as grading student responses for each rubric criterion pre-specified by the educators. We then create a dataset for STE between Japanese and English including 21 questions, along with a total of 3,498 student responses (167 on average). The answer responses were collected from students and crowd workers. Using this dataset, we demonstrate the performance of baselines including a finetuned BERT model and GPT-3.5 with few-shot learning. Experimental results showed that the baseline model with fine-tuned BERT was able to classify correct responses with approximately 90% in F₁, but only less than 80% for incorrect responses. Furthermore, GPT-3.5 with few-shot learning shows a poorer result than the BERT model, indicating that our newly proposed task presents a challenging issue, even for the state-of-the-art large language model.</abstract>
<identifier type="citekey">miura-etal-2024-japanese</identifier>
<location>
<url>https://aclanthology.org/2024.eacl-srw.21</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>266</start>
<end>278</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Japanese-English Sentence Translation Exercises Dataset for Automatic Grading
%A Miura, Naoki
%A Funayama, Hiroaki
%A Kikuchi, Seiya
%A Matsubayashi, Yuichiroh
%A Iwase, Yuya
%A Inui, Kentaro
%Y Falk, Neele
%Y Papi, Sara
%Y Zhang, Mike
%S Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F miura-etal-2024-japanese
%X This paper proposes the task of automatic assessment of Sentence Translation Exercises (STEs), that have been used in the early stage of L2 language learning. We formalize the task as grading student responses for each rubric criterion pre-specified by the educators. We then create a dataset for STE between Japanese and English including 21 questions, along with a total of 3,498 student responses (167 on average). The answer responses were collected from students and crowd workers. Using this dataset, we demonstrate the performance of baselines including a finetuned BERT model and GPT-3.5 with few-shot learning. Experimental results showed that the baseline model with fine-tuned BERT was able to classify correct responses with approximately 90% in F₁, but only less than 80% for incorrect responses. Furthermore, GPT-3.5 with few-shot learning shows a poorer result than the BERT model, indicating that our newly proposed task presents a challenging issue, even for the state-of-the-art large language model.
%U https://aclanthology.org/2024.eacl-srw.21
%P 266-278
Markdown (Informal)
[Japanese-English Sentence Translation Exercises Dataset for Automatic Grading](https://aclanthology.org/2024.eacl-srw.21) (Miura et al., EACL 2024)
ACL
- Naoki Miura, Hiroaki Funayama, Seiya Kikuchi, Yuichiroh Matsubayashi, Yuya Iwase, and Kentaro Inui. 2024. Japanese-English Sentence Translation Exercises Dataset for Automatic Grading. In Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop, pages 266–278, St. Julian’s, Malta. Association for Computational Linguistics.