@inproceedings{yoshida-etal-2024-maps,
title = "{MAP}`s not dead yet: Uncovering true language model modes by conditioning away degeneracy",
author = "Yoshida, Davis and
Goyal, Kartik and
Gimpel, Kevin",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-long.855/",
doi = "10.18653/v1/2024.acl-long.855",
pages = "16164--16215",
abstract = "It has been widely observed that exact or approximate MAP (mode-seeking) decoding from natural language generation (NLG) models consistently leads to degenerate outputs (Holtzman et al., 2019; Stahlberg and Byrne, 2019). Prior work has attributed this behavior to either a fundamental and unavoidable inadequacy of modes in probabilistic models or weaknesses in language modeling. Contrastingly, we argue that degenerate modes can even occur in the absence of any modeling error, due to contamination of the training data. Specifically, we argue that mixing even a tiny amount of low-entropy noise with a population text distribution can cause the data distribution`s mode to become degenerate. We therefore propose to apply MAP decoding to the model`s true conditional distribution where the conditioning variable explicitly avoids specific degenerate behavior. Using exact search, we empirically verify that the length-conditional modes of machine translation models and language models are indeed more fluent and topical than their unconditional modes. For the first time, we also share many examples of exact modal sequences from these models, and from several variants of the LLaMA-7B model. Notably, we observethat various kinds of degenerate modes persist, even at the scale of LLaMA-7B. Although we cannot tractably address these degeneracieswith exact search, we perform a classifier-based approximate search on LLaMA-7B, a model which was not trained for instruction following, and find that we are able to elicit reasonable outputs without any finetuning."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yoshida-etal-2024-maps">
<titleInfo>
<title>MAP‘s not dead yet: Uncovering true language model modes by conditioning away degeneracy</title>
</titleInfo>
<name type="personal">
<namePart type="given">Davis</namePart>
<namePart type="family">Yoshida</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kartik</namePart>
<namePart type="family">Goyal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Gimpel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>It has been widely observed that exact or approximate MAP (mode-seeking) decoding from natural language generation (NLG) models consistently leads to degenerate outputs (Holtzman et al., 2019; Stahlberg and Byrne, 2019). Prior work has attributed this behavior to either a fundamental and unavoidable inadequacy of modes in probabilistic models or weaknesses in language modeling. Contrastingly, we argue that degenerate modes can even occur in the absence of any modeling error, due to contamination of the training data. Specifically, we argue that mixing even a tiny amount of low-entropy noise with a population text distribution can cause the data distribution‘s mode to become degenerate. We therefore propose to apply MAP decoding to the model‘s true conditional distribution where the conditioning variable explicitly avoids specific degenerate behavior. Using exact search, we empirically verify that the length-conditional modes of machine translation models and language models are indeed more fluent and topical than their unconditional modes. For the first time, we also share many examples of exact modal sequences from these models, and from several variants of the LLaMA-7B model. Notably, we observethat various kinds of degenerate modes persist, even at the scale of LLaMA-7B. Although we cannot tractably address these degeneracieswith exact search, we perform a classifier-based approximate search on LLaMA-7B, a model which was not trained for instruction following, and find that we are able to elicit reasonable outputs without any finetuning.</abstract>
<identifier type="citekey">yoshida-etal-2024-maps</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.855</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-long.855/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>16164</start>
<end>16215</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MAP‘s not dead yet: Uncovering true language model modes by conditioning away degeneracy
%A Yoshida, Davis
%A Goyal, Kartik
%A Gimpel, Kevin
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F yoshida-etal-2024-maps
%X It has been widely observed that exact or approximate MAP (mode-seeking) decoding from natural language generation (NLG) models consistently leads to degenerate outputs (Holtzman et al., 2019; Stahlberg and Byrne, 2019). Prior work has attributed this behavior to either a fundamental and unavoidable inadequacy of modes in probabilistic models or weaknesses in language modeling. Contrastingly, we argue that degenerate modes can even occur in the absence of any modeling error, due to contamination of the training data. Specifically, we argue that mixing even a tiny amount of low-entropy noise with a population text distribution can cause the data distribution‘s mode to become degenerate. We therefore propose to apply MAP decoding to the model‘s true conditional distribution where the conditioning variable explicitly avoids specific degenerate behavior. Using exact search, we empirically verify that the length-conditional modes of machine translation models and language models are indeed more fluent and topical than their unconditional modes. For the first time, we also share many examples of exact modal sequences from these models, and from several variants of the LLaMA-7B model. Notably, we observethat various kinds of degenerate modes persist, even at the scale of LLaMA-7B. Although we cannot tractably address these degeneracieswith exact search, we perform a classifier-based approximate search on LLaMA-7B, a model which was not trained for instruction following, and find that we are able to elicit reasonable outputs without any finetuning.
%R 10.18653/v1/2024.acl-long.855
%U https://aclanthology.org/2024.luhme-long.855/
%U https://doi.org/10.18653/v1/2024.acl-long.855
%P 16164-16215
Markdown (Informal)
[MAP’s not dead yet: Uncovering true language model modes by conditioning away degeneracy](https://aclanthology.org/2024.luhme-long.855/) (Yoshida et al., ACL 2024)
ACL