@inproceedings{kimura-etal-2024-l3masking,
title = "{L}3{M}asking: Multi-task Fine-tuning for Language Models by Leveraging Lessons Learned from Vanilla Models",
author = "Kimura, Yusuke and
Komamizu, Takahiro and
Hatano, Kenji",
editor = "Kumar, Sachin and
Balachandran, Vidhisha and
Park, Chan Young and
Shi, Weijia and
Hayati, Shirley Anugrah and
Tsvetkov, Yulia and
Smith, Noah and
Hajishirzi, Hannaneh and
Kang, Dongyeop and
Jurgens, David",
booktitle = "Proceedings of the 1st Workshop on Customizable NLP: Progress and Challenges in Customizing NLP for a Domain, Application, Group, or Individual (CustomNLP4U)",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.customnlp4u-1.6",
pages = "53--62",
abstract = "When distributional differences exist between pre-training and fine-tuning data, language models (LMs) may perform poorly on downstream tasks.Recent studies have reported that multi-task learning of downstream task and masked language modeling (MLM) task during the fine-tuning phase improves the performance of the downstream task.Typical MLM tasks (e.g., random token masking (RTM)) tend not to care tokens corresponding to the knowledge already acquired during the pre-training phase, therefore LMs may not notice the important clue or not effective to acquire linguistic knowledge of the task or domain.To overcome this limitation, we propose a new masking strategy for MLM task, called L3Masking, that leverages lessons (specifically, token-wise likelihood in a context) learned from the vanilla language model to be fine-tuned.L3Masking actively masks tokens with low likelihood on the vanilla model.Experimental evaluations on text classification tasks in different domains confirms a multi-task text classification method with L3Masking performed task adaptation more effectively than that with RTM.These results suggest the usefulness of assigning a preference to the tokens to be learned as the task or domain adaptation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kimura-etal-2024-l3masking">
<titleInfo>
<title>L3Masking: Multi-task Fine-tuning for Language Models by Leveraging Lessons Learned from Vanilla Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yusuke</namePart>
<namePart type="family">Kimura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Takahiro</namePart>
<namePart type="family">Komamizu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kenji</namePart>
<namePart type="family">Hatano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Customizable NLP: Progress and Challenges in Customizing NLP for a Domain, Application, Group, or Individual (CustomNLP4U)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sachin</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vidhisha</namePart>
<namePart type="family">Balachandran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chan</namePart>
<namePart type="given">Young</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weijia</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shirley</namePart>
<namePart type="given">Anugrah</namePart>
<namePart type="family">Hayati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulia</namePart>
<namePart type="family">Tsvetkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noah</namePart>
<namePart type="family">Smith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hannaneh</namePart>
<namePart type="family">Hajishirzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongyeop</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>When distributional differences exist between pre-training and fine-tuning data, language models (LMs) may perform poorly on downstream tasks.Recent studies have reported that multi-task learning of downstream task and masked language modeling (MLM) task during the fine-tuning phase improves the performance of the downstream task.Typical MLM tasks (e.g., random token masking (RTM)) tend not to care tokens corresponding to the knowledge already acquired during the pre-training phase, therefore LMs may not notice the important clue or not effective to acquire linguistic knowledge of the task or domain.To overcome this limitation, we propose a new masking strategy for MLM task, called L3Masking, that leverages lessons (specifically, token-wise likelihood in a context) learned from the vanilla language model to be fine-tuned.L3Masking actively masks tokens with low likelihood on the vanilla model.Experimental evaluations on text classification tasks in different domains confirms a multi-task text classification method with L3Masking performed task adaptation more effectively than that with RTM.These results suggest the usefulness of assigning a preference to the tokens to be learned as the task or domain adaptation.</abstract>
<identifier type="citekey">kimura-etal-2024-l3masking</identifier>
<location>
<url>https://aclanthology.org/2024.customnlp4u-1.6</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>53</start>
<end>62</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T L3Masking: Multi-task Fine-tuning for Language Models by Leveraging Lessons Learned from Vanilla Models
%A Kimura, Yusuke
%A Komamizu, Takahiro
%A Hatano, Kenji
%Y Kumar, Sachin
%Y Balachandran, Vidhisha
%Y Park, Chan Young
%Y Shi, Weijia
%Y Hayati, Shirley Anugrah
%Y Tsvetkov, Yulia
%Y Smith, Noah
%Y Hajishirzi, Hannaneh
%Y Kang, Dongyeop
%Y Jurgens, David
%S Proceedings of the 1st Workshop on Customizable NLP: Progress and Challenges in Customizing NLP for a Domain, Application, Group, or Individual (CustomNLP4U)
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F kimura-etal-2024-l3masking
%X When distributional differences exist between pre-training and fine-tuning data, language models (LMs) may perform poorly on downstream tasks.Recent studies have reported that multi-task learning of downstream task and masked language modeling (MLM) task during the fine-tuning phase improves the performance of the downstream task.Typical MLM tasks (e.g., random token masking (RTM)) tend not to care tokens corresponding to the knowledge already acquired during the pre-training phase, therefore LMs may not notice the important clue or not effective to acquire linguistic knowledge of the task or domain.To overcome this limitation, we propose a new masking strategy for MLM task, called L3Masking, that leverages lessons (specifically, token-wise likelihood in a context) learned from the vanilla language model to be fine-tuned.L3Masking actively masks tokens with low likelihood on the vanilla model.Experimental evaluations on text classification tasks in different domains confirms a multi-task text classification method with L3Masking performed task adaptation more effectively than that with RTM.These results suggest the usefulness of assigning a preference to the tokens to be learned as the task or domain adaptation.
%U https://aclanthology.org/2024.customnlp4u-1.6
%P 53-62
Markdown (Informal)
[L3Masking: Multi-task Fine-tuning for Language Models by Leveraging Lessons Learned from Vanilla Models](https://aclanthology.org/2024.customnlp4u-1.6) (Kimura et al., CustomNLP4U 2024)
ACL