@inproceedings{cheng-etal-2023-decouple,
title = "Decouple knowledge from paramters for plug-and-play language modeling",
author = "Cheng, Xin and
Lin, Yankai and
Chen, Xiuying and
Zhao, Dongyan and
Yan, Rui",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.901",
doi = "10.18653/v1/2023.findings-acl.901",
pages = "14288--14308",
abstract = "Pre-trained language models (PLM) have made impressive results in a wide range of NLP tasks and it has been revealed that one of the key factors to their success is the parameters of these models implicitly learn various types of knowledge in the pre-training corpus. However, encoding knowledge implicitly in the model parameters has two fundamental drawbacks. First, the knowledge is neither editable nor scalable once the model is trained, which is especially problematic in that knowledge is consistently evolving. Second, it lacks interpretability and prevents us from understanding what kind of knowledge PLM needs to solve a certain task. In this paper, we introduce {pasted macro {`}MODEL{'}}, a pre-training model with differentiable plug-in memory (DPM). The key intuition behind is to decouple the knowledge storage from model parameters with an editable and scalable key-value memory and leverage knowledge in an explainable manner by knowledge retrieval in the {pasted macro {`}MEMORY{'}}. We conduct extensive experiments under various settings to justify this design choice. In domain adaptation setting, {pasted macro {`}MODEL{'}} could be easily adapted to different domains with pluggable in-domain memory{---}obtaining 3.95 F1 improvements across four domains, without any in-domain training. {pasted macro {`}MODEL{'}} could also keep absorbing new knowledge after pre-training is done by knowledge updating operation in the {pasted macro {`}MEMORY{'}} without re-training. Finally, we show that by incorporating training samples into {pasted macro {`}MEMORY{'}} with knowledge prompting, {pasted macro {`}MODEL{'}} could further be improved by the instruction of in-task knowledge.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cheng-etal-2023-decouple">
<titleInfo>
<title>Decouple knowledge from parameters for plug-and-play language modeling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xin</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yankai</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiuying</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongyan</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Pre-trained language models (PLMs) have achieved impressive results on a wide range of NLP tasks, and one of the key factors behind their success is that their parameters implicitly learn various types of knowledge from the pre-training corpus. However, encoding knowledge implicitly in the model parameters has two fundamental drawbacks. First, the knowledge is neither editable nor scalable once the model is trained, which is especially problematic given that knowledge is constantly evolving. Second, it lacks interpretability and prevents us from understanding what kind of knowledge a PLM needs to solve a certain task. In this paper, we introduce PlugLM, a pre-training model with a differentiable plug-in memory (DPM). The key intuition is to decouple knowledge storage from the model parameters with an editable and scalable key-value memory, and to leverage knowledge in an explainable manner through knowledge retrieval in the DPM. We conduct extensive experiments under various settings to justify this design choice. In the domain adaptation setting, PlugLM can be easily adapted to different domains with a pluggable in-domain memory, obtaining 3.95 F1 improvements across four domains without any in-domain training. PlugLM can also keep absorbing new knowledge after pre-training is done through a knowledge updating operation in the DPM, without re-training. Finally, we show that by incorporating training samples into the DPM with knowledge prompting, PlugLM can be further improved by the guidance of in-task knowledge.</abstract>
<identifier type="citekey">cheng-etal-2023-decouple</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.901</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.901</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>14288</start>
<end>14308</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Decouple knowledge from parameters for plug-and-play language modeling
%A Cheng, Xin
%A Lin, Yankai
%A Chen, Xiuying
%A Zhao, Dongyan
%A Yan, Rui
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F cheng-etal-2023-decouple
%X Pre-trained language models (PLMs) have achieved impressive results on a wide range of NLP tasks, and one of the key factors behind their success is that their parameters implicitly learn various types of knowledge from the pre-training corpus. However, encoding knowledge implicitly in the model parameters has two fundamental drawbacks. First, the knowledge is neither editable nor scalable once the model is trained, which is especially problematic given that knowledge is constantly evolving. Second, it lacks interpretability and prevents us from understanding what kind of knowledge a PLM needs to solve a certain task. In this paper, we introduce PlugLM, a pre-training model with a differentiable plug-in memory (DPM). The key intuition is to decouple knowledge storage from the model parameters with an editable and scalable key-value memory, and to leverage knowledge in an explainable manner through knowledge retrieval in the DPM. We conduct extensive experiments under various settings to justify this design choice. In the domain adaptation setting, PlugLM can be easily adapted to different domains with a pluggable in-domain memory, obtaining 3.95 F1 improvements across four domains without any in-domain training. PlugLM can also keep absorbing new knowledge after pre-training is done through a knowledge updating operation in the DPM, without re-training. Finally, we show that by incorporating training samples into the DPM with knowledge prompting, PlugLM can be further improved by the guidance of in-task knowledge.
%R 10.18653/v1/2023.findings-acl.901
%U https://aclanthology.org/2023.findings-acl.901
%U https://doi.org/10.18653/v1/2023.findings-acl.901
%P 14288-14308
Markdown (Informal)
[Decouple knowledge from parameters for plug-and-play language modeling](https://aclanthology.org/2023.findings-acl.901) (Cheng et al., Findings 2023)
ACL
Xin Cheng, Yankai Lin, Xiuying Chen, Dongyan Zhao, and Rui Yan. 2023. Decouple knowledge from parameters for plug-and-play language modeling. In Findings of the Association for Computational Linguistics: ACL 2023, pages 14288–14308, Toronto, Canada. Association for Computational Linguistics.
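
For readers skimming this record: the abstract describes an architecture in which knowledge lives in an editable, scalable key-value memory outside the model parameters and is fetched by retrieval. Below is a minimal, illustrative Python sketch of that key-value idea only; the class, the toy hashed bag-of-words encoder, and all names are assumptions made for demonstration and are not the authors' PlugLM/DPM implementation.

```python
# Illustrative sketch of an editable key-value knowledge memory, as described
# in the abstract above. NOT the authors' PlugLM/DPM code; names and the toy
# encoder are assumptions for demonstration only.
import numpy as np


class PlugInMemory:
    """Key-value store queried by dense similarity; entries can be added or
    replaced after training without touching any model weights."""

    def __init__(self, dim: int):
        self.dim = dim
        self.keys = np.empty((0, dim))  # one row per knowledge entry
        self.values: list[str] = []     # payload, e.g. a text snippet

    def add(self, key: np.ndarray, value: str) -> None:
        # Knowledge updating: append a new entry, no re-training required.
        self.keys = np.vstack([self.keys, key.reshape(1, -1)])
        self.values.append(value)

    def retrieve(self, query: np.ndarray, top_k: int = 1) -> list[str]:
        # Dot-product retrieval over the memory keys.
        scores = self.keys @ query
        top = np.argsort(-scores)[:top_k]
        return [self.values[i] for i in top]


def toy_encoder(text: str, dim: int = 64) -> np.ndarray:
    # Hypothetical stand-in for a real sentence encoder: a hashed
    # bag-of-words embedding, deterministic and dependency-free.
    vec = np.zeros(dim)
    for token in text.lower().split():
        vec[hash(token) % dim] += 1.0
    norm = np.linalg.norm(vec)
    return vec / norm if norm > 0 else vec


if __name__ == "__main__":
    memory = PlugInMemory(dim=64)
    memory.add(toy_encoder("Toronto hosted ACL 2023"),
               "ACL 2023 was held in Toronto, Canada.")
    memory.add(toy_encoder("key-value memory stores knowledge"),
               "Knowledge lives in an editable key-value memory.")
    print(memory.retrieve(toy_encoder("Where was ACL 2023 held"), top_k=1))
```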