@inproceedings{yin-etal-2024-semformer,
title = "Semformer: Transformer Language Models with Semantic Planning",
author = "Yin, Yongjing and
Ding, Junran and
Song, Kai and
Zhang, Yue",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1039",
pages = "18669--18680",
abstract = "Next-token prediction serves as the dominant component in current neural language models.During the training phase, the model employs teacher forcing, which predicts tokens based on all preceding ground truth tokens.However, this approach has been found to create shortcuts, utilizing the revealed prefix to spuriously fit future tokens, potentially compromising the accuracy of the next-token predictor.In this paper, we introduce Semformer, a novel method of training a Transformer language model that explicitly models the semantic planning of response.Specifically, we incorporate a sequence of planning tokens into the prefix, guiding the planning token representations to predict the latent semantic representations of the response, which are induced by an autoencoder.In a minimal planning task (i.e., graph path-finding), our model exhibits near-perfect performance and effectively mitigates shortcut learning, a feat that standard training methods and baseline models have been unable to accomplish.Furthermore, we pretrain Semformer from scratch with 125M parameters, demonstrating its efficacy through measures of perplexity, in-context learning, and fine-tuning on summarization tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yin-etal-2024-semformer">
<titleInfo>
<title>Semformer: Transformer Language Models with Semantic Planning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yongjing</namePart>
<namePart type="family">Yin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junran</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Next-token prediction serves as the dominant component in current neural language models.During the training phase, the model employs teacher forcing, which predicts tokens based on all preceding ground truth tokens.However, this approach has been found to create shortcuts, utilizing the revealed prefix to spuriously fit future tokens, potentially compromising the accuracy of the next-token predictor.In this paper, we introduce Semformer, a novel method of training a Transformer language model that explicitly models the semantic planning of response.Specifically, we incorporate a sequence of planning tokens into the prefix, guiding the planning token representations to predict the latent semantic representations of the response, which are induced by an autoencoder.In a minimal planning task (i.e., graph path-finding), our model exhibits near-perfect performance and effectively mitigates shortcut learning, a feat that standard training methods and baseline models have been unable to accomplish.Furthermore, we pretrain Semformer from scratch with 125M parameters, demonstrating its efficacy through measures of perplexity, in-context learning, and fine-tuning on summarization tasks.</abstract>
<identifier type="citekey">yin-etal-2024-semformer</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1039</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>18669</start>
<end>18680</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Semformer: Transformer Language Models with Semantic Planning
%A Yin, Yongjing
%A Ding, Junran
%A Song, Kai
%A Zhang, Yue
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F yin-etal-2024-semformer
%X Next-token prediction serves as the dominant component in current neural language models.During the training phase, the model employs teacher forcing, which predicts tokens based on all preceding ground truth tokens.However, this approach has been found to create shortcuts, utilizing the revealed prefix to spuriously fit future tokens, potentially compromising the accuracy of the next-token predictor.In this paper, we introduce Semformer, a novel method of training a Transformer language model that explicitly models the semantic planning of response.Specifically, we incorporate a sequence of planning tokens into the prefix, guiding the planning token representations to predict the latent semantic representations of the response, which are induced by an autoencoder.In a minimal planning task (i.e., graph path-finding), our model exhibits near-perfect performance and effectively mitigates shortcut learning, a feat that standard training methods and baseline models have been unable to accomplish.Furthermore, we pretrain Semformer from scratch with 125M parameters, demonstrating its efficacy through measures of perplexity, in-context learning, and fine-tuning on summarization tasks.
%U https://aclanthology.org/2024.emnlp-main.1039
%P 18669-18680
Markdown (Informal)
[Semformer: Transformer Language Models with Semantic Planning](https://aclanthology.org/2024.emnlp-main.1039) (Yin et al., EMNLP 2024)
ACL