BibTeX
@inproceedings{svete-cotterell-2024-transformers,
    title = "Transformers Can Represent $n$-gram Language Models",
    author = "Svete, Anej and
      Cotterell, Ryan",
    editor = "Duh, Kevin and
      Gomez, Helena and
      Bethard, Steven",
    booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
    month = jun,
    year = "2024",
    address = "Mexico City, Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.naacl-long.381",
    doi = "10.18653/v1/2024.naacl-long.381",
    pages = "6845--6881",
    abstract = "Plenty of existing work has analyzed the abilities of the transformer architecture by describing its representational capacity with formal models of computation. However, the focus so far has been on analyzing the architecture in terms of language \textit{acceptance}. We contend that this is an ill-suited problem in the study of \textit{language models} (LMs), which are definitionally \textit{probability distributions} over strings. In this paper, we focus on the relationship between transformer LMs and $n$-gram LMs, a simple and historically relevant class of language models. We show that transformer LMs using the hard or sparse attention mechanisms can exactly represent any $n$-gram LM, giving us a concrete lower bound on their probabilistic representational capacity. This provides a first step towards understanding the mechanisms that transformer LMs can use to represent probability distributions over strings.",
}
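To cite this work from a LaTeX document, the entry above can be referenced by its citekey, svete-cotterell-2024-transformers. A minimal sketch, assuming the natbib package and a bibliography file named anthology.bib that contains the entry (both of these are assumptions, not part of the record above):

\documentclass{article}
\usepackage[round]{natbib}  % assumed citation package; plain \cite with standard BibTeX also works
\begin{document}
Transformer LMs with hard or sparse attention can exactly represent any
$n$-gram LM \citep{svete-cotterell-2024-transformers}.
\bibliographystyle{plainnat}
\bibliography{anthology}  % assumes the BibTeX entry above is saved in anthology.bib
\end{document}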
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="svete-cotterell-2024-transformers">
    <titleInfo>
        <title>Transformers Can Represent n-gram Language Models</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Anej</namePart>
        <namePart type="family">Svete</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Ryan</namePart>
        <namePart type="family">Cotterell</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2024-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Kevin</namePart>
            <namePart type="family">Duh</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Helena</namePart>
            <namePart type="family">Gomez</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Steven</namePart>
            <namePart type="family">Bethard</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Mexico City, Mexico</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Plenty of existing work has analyzed the abilities of the transformer architecture by describing its representational capacity with formal models of computation. However, the focus so far has been on analyzing the architecture in terms of language acceptance. We contend that this is an ill-suited problem in the study of language models (LMs), which are definitionally probability distributions over strings. In this paper, we focus on the relationship between transformer LMs and n-gram LMs, a simple and historically relevant class of language models. We show that transformer LMs using the hard or sparse attention mechanisms can exactly represent any n-gram LM, giving us a concrete lower bound on their probabilistic representational capacity. This provides a first step towards understanding the mechanisms that transformer LMs can use to represent probability distributions over strings.</abstract>
    <identifier type="citekey">svete-cotterell-2024-transformers</identifier>
    <identifier type="doi">10.18653/v1/2024.naacl-long.381</identifier>
    <location>
        <url>https://aclanthology.org/2024.naacl-long.381</url>
    </location>
    <part>
        <date>2024-06</date>
        <extent unit="page">
            <start>6845</start>
            <end>6881</end>
        </extent>
    </part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Transformers Can Represent n-gram Language Models
%A Svete, Anej
%A Cotterell, Ryan
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F svete-cotterell-2024-transformers
%X Plenty of existing work has analyzed the abilities of the transformer architecture by describing its representational capacity with formal models of computation. However, the focus so far has been on analyzing the architecture in terms of language acceptance. We contend that this is an ill-suited problem in the study of language models (LMs), which are definitionally probability distributions over strings. In this paper, we focus on the relationship between transformer LMs and n-gram LMs, a simple and historically relevant class of language models. We show that transformer LMs using the hard or sparse attention mechanisms can exactly represent any n-gram LM, giving us a concrete lower bound on their probabilistic representational capacity. This provides a first step towards understanding the mechanisms that transformer LMs can use to represent probability distributions over strings.
%R 10.18653/v1/2024.naacl-long.381
%U https://aclanthology.org/2024.naacl-long.381
%U https://doi.org/10.18653/v1/2024.naacl-long.381
%P 6845-6881
Markdown (Informal)
[Transformers Can Represent n-gram Language Models](https://aclanthology.org/2024.naacl-long.381) (Svete & Cotterell, NAACL 2024)
ACL
Anej Svete and Ryan Cotterell. 2024. Transformers Can Represent n-gram Language Models. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 6845–6881, Mexico City, Mexico. Association for Computational Linguistics.