@comment{
  Workshop paper from the CONDA 2024 proceedings.
  Names are stored in "Last, First" form and joined with " and ";
  the month uses the predefined three-letter macro so styles can localise it.
}
@inproceedings{mehrbakhsh-etal-2024-confounders,
  title     = {Confounders in Instance Variation for the Analysis of Data Contamination},
  author    = {Mehrbakhsh, Behzad and
               Garigliotti, Dario and
               Mart{\'\i}nez-Plumed, Fernando and
               Hernandez-Orallo, Jose},
  editor    = {Sainz, Oscar and
               Garc{\'\i}a Ferrero, Iker and
               Agirre, Eneko and
               Ander Campos, Jon and
               Jacovi, Alon and
               Elazar, Yanai and
               Goldberg, Yoav},
  booktitle = {Proceedings of the 1st Workshop on Data Contamination (CONDA)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  pages     = {13--21},
  doi       = {10.18653/v1/2024.conda-1.2},
  url       = {https://aclanthology.org/2024.conda-1.2},
  abstract  = {Test contamination is a serious problem for the evaluation of large language models (LLMs) because it leads to the overestimation of their performance and a quick saturation of benchmarks, even before the actual capability is achieved. One strategy to address this issue is the (adversarial) generation of variations, by including different exemplars and different rephrasings of the questions. However, these two interventions can lead to instances that can be more difficult (accumulating on the expected loss of performance by partly removing the contamination) but also to instances that can be less difficult (cancelling the expected loss of performance), which would make contamination undetectable. Understanding these two phenomena in terms of instance difficulty is critical to determine and measure contamination. In this paper we conduct a comprehensive analysis of these two interventions on an addition task with fine-tuned LLAMA-2 models.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single bibliographic record. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="mehrbakhsh-etal-2024-confounders">
    <titleInfo>
      <title>Confounders in Instance Variation for the Analysis of Data Contamination</title>
    </titleInfo>
    <!-- Authors of the paper, in the order given by the record. -->
    <name type="personal">
      <namePart type="given">Behzad</namePart>
      <namePart type="family">Mehrbakhsh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dario</namePart>
      <namePart type="family">Garigliotti</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fernando</namePart>
      <namePart type="family">Martínez-Plumed</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jose</namePart>
      <namePart type="family">Hernandez-Orallo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 1st Workshop on Data Contamination (CONDA)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Oscar</namePart>
        <namePart type="family">Sainz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Iker</namePart>
        <namePart type="family">García Ferrero</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Eneko</namePart>
        <namePart type="family">Agirre</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jon</namePart>
        <namePart type="family">Ander Campos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alon</namePart>
        <namePart type="family">Jacovi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yanai</namePart>
        <namePart type="family">Elazar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yoav</namePart>
        <namePart type="family">Goldberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bangkok, Thailand</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Test contamination is a serious problem for the evaluation of large language models (LLMs) because it leads to the overestimation of their performance and a quick saturation of benchmarks, even before the actual capability is achieved. One strategy to address this issue is the (adversarial) generation of variations, by including different exemplars and different rephrasings of the questions. However, these two interventions can lead to instances that can be more difficult (accumulating on the expected loss of performance by partly removing the contamination) but also to instances that can be less difficult (cancelling the expected loss of performance), which would make contamination undetectable. Understanding these two phenomena in terms of instance difficulty is critical to determine and measure contamination. In this paper we conduct a comprehensive analysis of these two interventions on an addition task with fine-tuned LLAMA-2 models.</abstract>
    <!-- Identifiers and access location for the paper. -->
    <identifier type="citekey">mehrbakhsh-etal-2024-confounders</identifier>
    <identifier type="doi">10.18653/v1/2024.conda-1.2</identifier>
    <location>
      <url>https://aclanthology.org/2024.conda-1.2</url>
    </location>
    <!-- Date and page extent recorded for this part. -->
    <part>
      <date>2024-08</date>
      <extent unit="page">
        <start>13</start>
        <end>21</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Confounders in Instance Variation for the Analysis of Data Contamination
%A Mehrbakhsh, Behzad
%A Garigliotti, Dario
%A Martínez-Plumed, Fernando
%A Hernandez-Orallo, Jose
%Y Sainz, Oscar
%Y García Ferrero, Iker
%Y Agirre, Eneko
%Y Ander Campos, Jon
%Y Jacovi, Alon
%Y Elazar, Yanai
%Y Goldberg, Yoav
%S Proceedings of the 1st Workshop on Data Contamination (CONDA)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F mehrbakhsh-etal-2024-confounders
%X Test contamination is a serious problem for the evaluation of large language models (LLMs) because it leads to the overestimation of their performance and a quick saturation of benchmarks, even before the actual capability is achieved. One strategy to address this issue is the (adversarial) generation of variations, by including different exemplars and different rephrasings of the questions. However, these two interventions can lead to instances that can be more difficult (accumulating on the expected loss of performance by partly removing the contamination) but also to instances that can be less difficult (cancelling the expected loss of performance), which would make contamination undetectable. Understanding these two phenomena in terms of instance difficulty is critical to determine and measure contamination. In this paper we conduct a comprehensive analysis of these two interventions on an addition task with fine-tuned LLAMA-2 models.
%R 10.18653/v1/2024.conda-1.2
%U https://aclanthology.org/2024.conda-1.2
%U https://doi.org/10.18653/v1/2024.conda-1.2
%P 13-21
Markdown (Informal)
[Confounders in Instance Variation for the Analysis of Data Contamination](https://aclanthology.org/2024.conda-1.2) (Mehrbakhsh et al., CONDA-WS 2024)
ACL