@inproceedings{liu-etal-2024-evaluating,
title = "Evaluating {C}hinese Large Language Models on Discipline Knowledge Acquisition via Memorization and Robustness Assessment",
author = "Liu, Chuang and
Jin, Renren and
Steedman, Mark and
Xiong, Deyi",
editor = "Sainz, Oscar and
Garc{\'\i}a Ferrero, Iker and
Agirre, Eneko and
Ander Campos, Jon and
Jacovi, Alon and
Elazar, Yanai and
Goldberg, Yoav",
booktitle = "Proceedings of the 1st Workshop on Data Contamination (CONDA)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.conda-1.1",
doi = "10.18653/v1/2024.conda-1.1",
pages = "1--12",
abstract = "Chinese LLMs demonstrate impressive performance on NLP tasks, particularly on discipline knowledge benchmarks, with some results approaching those of GPT-4. Previous research has viewed these advancements as potential outcomes of data contamination or leakage, prompting efforts to create new detection methods and address evaluation issues in LLM benchmarks. However, there has been a lack of comprehensive assessment of the evolution of Chinese LLMs. To address this gap, this paper offers a thorough investigation of Chinese LLMs on discipline knowledge evaluation, delving into the advancements of various LLMs, including a group of related models and others. Specifically, we have conducted six assessments ranging from knowledge memorization to comprehension for robustness, encompassing tasks like predicting incomplete questions and options, identifying behaviors by the contaminational fine-tuning, and answering rephrased questions. Experimental findings indicate a positive correlation between the release time of LLMs and their memorization capabilities, but they struggle with variations in original question-options pairs. Additionally, our findings suggest that question descriptions have a more significant impact on LLMs{'} performance.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2024-evaluating">
<titleInfo>
<title>Evaluating Chinese Large Language Models on Discipline Knowledge Acquisition via Memorization and Robustness Assessment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chuang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Renren</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Steedman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Deyi</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Data Contamination (CONDA)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Oscar</namePart>
<namePart type="family">Sainz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iker</namePart>
<namePart type="family">García Ferrero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eneko</namePart>
<namePart type="family">Agirre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jon</namePart>
<namePart type="family">Ander Campos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alon</namePart>
<namePart type="family">Jacovi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanai</namePart>
<namePart type="family">Elazar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Chinese LLMs demonstrate impressive performance on NLP tasks, particularly on discipline knowledge benchmarks, with some results approaching those of GPT-4. Previous research has viewed these advancements as potential outcomes of data contamination or leakage, prompting efforts to create new detection methods and address evaluation issues in LLM benchmarks. However, there has been a lack of comprehensive assessment of the evolution of Chinese LLMs. To address this gap, this paper offers a thorough investigation of Chinese LLMs on discipline knowledge evaluation, delving into the advancements of various LLMs, including a group of related models and others. Specifically, we have conducted six assessments ranging from knowledge memorization to comprehension for robustness, encompassing tasks like predicting incomplete questions and options, identifying behaviors by the contaminational fine-tuning, and answering rephrased questions. Experimental findings indicate a positive correlation between the release time of LLMs and their memorization capabilities, but they struggle with variations in original question-options pairs. Additionally, our findings suggest that question descriptions have a more significant impact on LLMs’ performance.</abstract>
<identifier type="citekey">liu-etal-2024-evaluating</identifier>
<identifier type="doi">10.18653/v1/2024.conda-1.1</identifier>
<location>
<url>https://aclanthology.org/2024.conda-1.1</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>1</start>
<end>12</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Chinese Large Language Models on Discipline Knowledge Acquisition via Memorization and Robustness Assessment
%A Liu, Chuang
%A Jin, Renren
%A Steedman, Mark
%A Xiong, Deyi
%Y Sainz, Oscar
%Y García Ferrero, Iker
%Y Agirre, Eneko
%Y Ander Campos, Jon
%Y Jacovi, Alon
%Y Elazar, Yanai
%Y Goldberg, Yoav
%S Proceedings of the 1st Workshop on Data Contamination (CONDA)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F liu-etal-2024-evaluating
%X Chinese LLMs demonstrate impressive performance on NLP tasks, particularly on discipline knowledge benchmarks, with some results approaching those of GPT-4. Previous research has viewed these advancements as potential outcomes of data contamination or leakage, prompting efforts to create new detection methods and address evaluation issues in LLM benchmarks. However, there has been a lack of comprehensive assessment of the evolution of Chinese LLMs. To address this gap, this paper offers a thorough investigation of Chinese LLMs on discipline knowledge evaluation, delving into the advancements of various LLMs, including a group of related models and others. Specifically, we have conducted six assessments ranging from knowledge memorization to comprehension for robustness, encompassing tasks like predicting incomplete questions and options, identifying behaviors by the contaminational fine-tuning, and answering rephrased questions. Experimental findings indicate a positive correlation between the release time of LLMs and their memorization capabilities, but they struggle with variations in original question-options pairs. Additionally, our findings suggest that question descriptions have a more significant impact on LLMs’ performance.
%R 10.18653/v1/2024.conda-1.1
%U https://aclanthology.org/2024.conda-1.1
%U https://doi.org/10.18653/v1/2024.conda-1.1
%P 1-12
Markdown (Informal)
[Evaluating Chinese Large Language Models on Discipline Knowledge Acquisition via Memorization and Robustness Assessment](https://aclanthology.org/2024.conda-1.1) (Liu et al., CONDA-WS 2024)
ACL