@inproceedings{ton-etal-2022-regularized,
title = "Regularized Training of Nearest Neighbor Language Models",
author = "Ton, Jean-Francois and
Talbott, Walter and
Zhai, Shuangfei and
Susskind, Joshua M.",
editor = "Ippolito, Daphne and
Li, Liunian Harold and
Pacheco, Maria Leonor and
Chen, Danqi and
Xue, Nianwen",
booktitle = "Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Student Research Workshop",
month = jul,
year = "2022",
address = "Hybrid: Seattle, Washington + Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.naacl-srw.4",
doi = "10.18653/v1/2022.naacl-srw.4",
pages = "25--30",
abstract = "Including memory banks in a natural language processing architecture increases model capacity by equipping it with additional data at inference time. In this paper, we build upon $k$NN-LM (CITATION), which uses a pre-trained language model together with an exhaustive $k$NN search through the training data (memory bank) to achieve state-of-the-art results. We investigate whether we can improve the $k$NN-LM performance by instead training a LM with the knowledge that we will be using a $k$NN post-hoc. We achieved significant improvement using our method on language modeling tasks on WIKI-2 and WIKI-103. The main phenomenon that we encounter is that adding a simple L2 regularization on the activations (not weights) of the model, a transformer, improves the post-hoc $k$NN classification performance. We explore some possible reasons for this improvement. In particular, we find that the added L2 regularization seems to improve the performance for high-frequency words without deteriorating the performance for low frequency ones.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ton-etal-2022-regularized">
<titleInfo>
<title>Regularized Training of Nearest Neighbor Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jean-Francois</namePart>
<namePart type="family">Ton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Walter</namePart>
<namePart type="family">Talbott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuangfei</namePart>
<namePart type="family">Zhai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joshua</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Susskind</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Student Research Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daphne</namePart>
<namePart type="family">Ippolito</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liunian</namePart>
<namePart type="given">Harold</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Leonor</namePart>
<namePart type="family">Pacheco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danqi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hybrid: Seattle, Washington + Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Including memory banks in a natural language processing architecture increases model capacity by equipping it with additional data at inference time. In this paper, we build upon kNN-LM (CITATION), which uses a pre-trained language model together with an exhaustive kNN search through the training data (memory bank) to achieve state-of-the-art results. We investigate whether we can improve the kNN-LM performance by instead training a LM with the knowledge that we will be using a kNN post-hoc. We achieved significant improvement using our method on language modeling tasks on WIKI-2 and WIKI-103. The main phenomenon that we encounter is that adding a simple L2 regularization on the activations (not weights) of the model, a transformer, improves the post-hoc kNN classification performance. We explore some possible reasons for this improvement. In particular, we find that the added L2 regularization seems to improve the performance for high-frequency words without deteriorating the performance for low frequency ones.</abstract>
<identifier type="citekey">ton-etal-2022-regularized</identifier>
<identifier type="doi">10.18653/v1/2022.naacl-srw.4</identifier>
<location>
<url>https://aclanthology.org/2022.naacl-srw.4</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>25</start>
<end>30</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Regularized Training of Nearest Neighbor Language Models
%A Ton, Jean-Francois
%A Talbott, Walter
%A Zhai, Shuangfei
%A Susskind, Joshua M.
%Y Ippolito, Daphne
%Y Li, Liunian Harold
%Y Pacheco, Maria Leonor
%Y Chen, Danqi
%Y Xue, Nianwen
%S Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Student Research Workshop
%D 2022
%8 July
%I Association for Computational Linguistics
%C Hybrid: Seattle, Washington + Online
%F ton-etal-2022-regularized
%X Including memory banks in a natural language processing architecture increases model capacity by equipping it with additional data at inference time. In this paper, we build upon kNN-LM (CITATION), which uses a pre-trained language model together with an exhaustive kNN search through the training data (memory bank) to achieve state-of-the-art results. We investigate whether we can improve the kNN-LM performance by instead training a LM with the knowledge that we will be using a kNN post-hoc. We achieved significant improvement using our method on language modeling tasks on WIKI-2 and WIKI-103. The main phenomenon that we encounter is that adding a simple L2 regularization on the activations (not weights) of the model, a transformer, improves the post-hoc kNN classification performance. We explore some possible reasons for this improvement. In particular, we find that the added L2 regularization seems to improve the performance for high-frequency words without deteriorating the performance for low frequency ones.
%R 10.18653/v1/2022.naacl-srw.4
%U https://aclanthology.org/2022.naacl-srw.4
%U https://doi.org/10.18653/v1/2022.naacl-srw.4
%P 25-30
Markdown (Informal)
[Regularized Training of Nearest Neighbor Language Models](https://aclanthology.org/2022.naacl-srw.4) (Ton et al., NAACL 2022)
ACL
- Jean-Francois Ton, Walter Talbott, Shuangfei Zhai, and Joshua M. Susskind. 2022. Regularized Training of Nearest Neighbor Language Models. In Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Student Research Workshop, pages 25–30, Hybrid: Seattle, Washington + Online. Association for Computational Linguistics.