BibTeX
@inproceedings{du-nguyen-2023-measuring,
title = "Measuring the Instability of Fine-Tuning",
author = "Du, Yupei and
Nguyen, Dong",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-long.342",
doi = "10.18653/v1/2023.acl-long.342",
pages = "6209--6230",
abstract = "Fine-tuning pre-trained language models on downstream tasks with varying random seeds has been shown to be unstable, especially on small datasets. Many previous studies have investigated this instability and proposed methods to mitigate it. However, most of these studies only used the standard deviation of performance scores (SD) as their measure, which is a narrow characterization of instability. In this paper, we analyze SD and six other measures quantifying instability of different granularity levels. Moreover, we propose a systematic evaluation framework of these measures{'} validity. Finally, we analyze the consistency and difference between different measures by reassessing existing instability mitigation methods. We hope our results will inform better measurements of the fine-tuning instability.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="du-nguyen-2023-measuring">
    <titleInfo>
      <title>Measuring the Instability of Fine-Tuning</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Yupei</namePart>
      <namePart type="family">Du</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dong</namePart>
      <namePart type="family">Nguyen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Rogers</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jordan</namePart>
        <namePart type="family">Boyd-Graber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Naoaki</namePart>
        <namePart type="family">Okazaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Toronto, Canada</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Fine-tuning pre-trained language models on downstream tasks with varying random seeds has been shown to be unstable, especially on small datasets. Many previous studies have investigated this instability and proposed methods to mitigate it. However, most of these studies only used the standard deviation of performance scores (SD) as their measure, which is a narrow characterization of instability. In this paper, we analyze SD and six other measures quantifying instability of different granularity levels. Moreover, we propose a systematic evaluation framework of these measures’ validity. Finally, we analyze the consistency and difference between different measures by reassessing existing instability mitigation methods. We hope our results will inform better measurements of the fine-tuning instability.</abstract>
    <identifier type="citekey">du-nguyen-2023-measuring</identifier>
    <identifier type="doi">10.18653/v1/2023.acl-long.342</identifier>
    <location>
      <url>https://aclanthology.org/2023.acl-long.342</url>
    </location>
    <part>
      <date>2023-07</date>
      <extent unit="page">
        <start>6209</start>
        <end>6230</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Measuring the Instability of Fine-Tuning
%A Du, Yupei
%A Nguyen, Dong
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F du-nguyen-2023-measuring
%X Fine-tuning pre-trained language models on downstream tasks with varying random seeds has been shown to be unstable, especially on small datasets. Many previous studies have investigated this instability and proposed methods to mitigate it. However, most of these studies only used the standard deviation of performance scores (SD) as their measure, which is a narrow characterization of instability. In this paper, we analyze SD and six other measures quantifying instability of different granularity levels. Moreover, we propose a systematic evaluation framework of these measures’ validity. Finally, we analyze the consistency and difference between different measures by reassessing existing instability mitigation methods. We hope our results will inform better measurements of the fine-tuning instability.
%R 10.18653/v1/2023.acl-long.342
%U https://aclanthology.org/2023.acl-long.342
%U https://doi.org/10.18653/v1/2023.acl-long.342
%P 6209-6230
Markdown (Informal)
[Measuring the Instability of Fine-Tuning](https://aclanthology.org/2023.acl-long.342) (Du & Nguyen, ACL 2023)
ACL
Yupei Du and Dong Nguyen. 2023. Measuring the Instability of Fine-Tuning. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 6209–6230, Toronto, Canada. Association for Computational Linguistics.
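For context, the quantity the abstract calls SD is the standard deviation of performance scores across fine-tuning runs that differ only in random seed. Below is a minimal Python sketch of that measure; the seed list, the scores, and the commented-out run_fine_tuning call are hypothetical placeholders for illustration, not part of the paper's released code.

import statistics

def performance_sd(scores):
    # Sample standard deviation of per-seed performance scores:
    # the coarse-grained instability measure the abstract refers to as SD.
    return statistics.stdev(scores)

# Hypothetical usage: fine-tune once per seed and collect a validation score.
# seeds = [0, 1, 2, 3, 4]
# scores = [run_fine_tuning(seed=s) for s in seeds]
scores = [0.91, 0.87, 0.90, 0.62, 0.89]  # illustrative values; one degenerate seed
print(f"SD across seeds: {performance_sd(scores):.4f}")

As the abstract argues, a single summary statistic like this can mask where the instability comes from (e.g., one collapsed run versus uniformly noisy runs), which motivates the paper's comparison of SD against six finer-grained measures.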