@inproceedings{palavalli-etal-2024-taxonomy,
    title     = {A Taxonomy for Data Contamination in Large Language Models},
    author    = {Palavalli, Medha and
      Bertsch, Amanda and
      Gormley, Matthew},
    editor    = {Sainz, Oscar and
      Garc{\'\i}a Ferrero, Iker and
      Agirre, Eneko and
      Ander Campos, Jon and
      Jacovi, Alon and
      Elazar, Yanai and
      Goldberg, Yoav},
    booktitle = {Proceedings of the 1st Workshop on Data Contamination (CONDA)},
    month     = aug,
    year      = {2024},
    address   = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2024.conda-1.3},
    pages     = {22--40},
    abstract  = {Large language models pretrained on extensive web corpora demonstrate remarkable performance across a wide range of downstream tasks. However, a growing concern is data contamination, where evaluation datasets may unintentionally be contained in the pretraining corpus, inflating model performance. Decontamination, the process of detecting and removing such data, is a potential solution; yet these contaminants may originate from altered versions of the test set, evading detection during decontamination. How different types of contamination impact the performance of language models on downstream tasks is not fully understood. We present a taxonomy that categorizes the various types of contamination encountered by LLMs during the pretraining phase and identify which types pose the highest risk. We analyze the impact of contamination on two key NLP tasks{---}summarization and question answering{---}revealing how different types of contamination influence task performance during evaluation.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="palavalli-etal-2024-taxonomy">
<titleInfo>
<title>A Taxonomy for Data Contamination in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Medha</namePart>
<namePart type="family">Palavalli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amanda</namePart>
<namePart type="family">Bertsch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Gormley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Data Contamination (CONDA)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Oscar</namePart>
<namePart type="family">Sainz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iker</namePart>
<namePart type="family">García Ferrero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eneko</namePart>
<namePart type="family">Agirre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jon</namePart>
<namePart type="family">Ander Campos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alon</namePart>
<namePart type="family">Jacovi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanai</namePart>
<namePart type="family">Elazar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models pretrained on extensive web corpora demonstrate remarkable performance across a wide range of downstream tasks. However, a growing concern is data contamination, where evaluation datasets may unintentionally be contained in the pretraining corpus, inflating model performance. Decontamination, the process of detecting and removing such data, is a potential solution; yet these contaminants may originate from altered versions of the test set, evading detection during decontamination. How different types of contamination impact the performance of language models on downstream tasks is not fully understood. We present a taxonomy that categorizes the various types of contamination encountered by LLMs during the pretraining phase and identify which types pose the highest risk. We analyze the impact of contamination on two key NLP tasks—summarization and question answering—revealing how different types of contamination influence task performance during evaluation.</abstract>
<identifier type="citekey">palavalli-etal-2024-taxonomy</identifier>
<location>
<url>https://aclanthology.org/2024.conda-1.3</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>22</start>
<end>40</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Taxonomy for Data Contamination in Large Language Models
%A Palavalli, Medha
%A Bertsch, Amanda
%A Gormley, Matthew
%Y Sainz, Oscar
%Y García Ferrero, Iker
%Y Agirre, Eneko
%Y Ander Campos, Jon
%Y Jacovi, Alon
%Y Elazar, Yanai
%Y Goldberg, Yoav
%S Proceedings of the 1st Workshop on Data Contamination (CONDA)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F palavalli-etal-2024-taxonomy
%X Large language models pretrained on extensive web corpora demonstrate remarkable performance across a wide range of downstream tasks. However, a growing concern is data contamination, where evaluation datasets may unintentionally be contained in the pretraining corpus, inflating model performance. Decontamination, the process of detecting and removing such data, is a potential solution; yet these contaminants may originate from altered versions of the test set, evading detection during decontamination. How different types of contamination impact the performance of language models on downstream tasks is not fully understood. We present a taxonomy that categorizes the various types of contamination encountered by LLMs during the pretraining phase and identify which types pose the highest risk. We analyze the impact of contamination on two key NLP tasks—summarization and question answering—revealing how different types of contamination influence task performance during evaluation.
%U https://aclanthology.org/2024.conda-1.3
%P 22-40
Markdown (Informal)
[A Taxonomy for Data Contamination in Large Language Models](https://aclanthology.org/2024.conda-1.3) (Palavalli et al., CONDA-WS 2024)
ACL