@inproceedings{nogueira-dos-santos-etal-2024-memory,
title = "Memory Augmented Language Models through Mixture of Word Experts",
author = "Nogueira dos Santos, Cicero and
Lee-Thorp, James and
Noble, Isaac and
Chang, Chung-Ching and
Uthus, David",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-long.249",
doi = "10.18653/v1/2024.naacl-long.249",
pages = "4425--4438",
abstract = "Scaling up the number of parameters of language models has proven to be an effective approach to improve performance. For dense models, increasing their size proportionally increases their computational footprint. In this work, we seek to aggressively decouple learning capacity and FLOPs through Mixture-of-Experts (MoE) style models with large knowledge-rich vocabulary based routing functions. Our proposed approach, dubbed Mixture of Word Experts (MoWE), can be seen as a memory augmented model, where a large set of word-specific experts play the role of a sparse memory. We demonstrate that MoWE performs significantly better than the T5 family of models with similar number of FLOPs in a variety of NLP tasks. Moreover, MoWE outperforms traditional MoE models on knowledge intensive tasks and has similar performance to complex memory augmented approaches that often require to invoke custom mechanisms to search the sparse memory.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nogueira-dos-santos-etal-2024-memory">
<titleInfo>
<title>Memory Augmented Language Models through Mixture of Word Experts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cicero</namePart>
<namePart type="family">Nogueira dos Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Lee-Thorp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isaac</namePart>
<namePart type="family">Noble</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chung-Ching</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Uthus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Scaling up the number of parameters of language models has proven to be an effective approach to improving performance. For dense models, increasing their size proportionally increases their computational footprint. In this work, we seek to aggressively decouple learning capacity and FLOPs through Mixture-of-Experts (MoE) style models with large knowledge-rich vocabulary-based routing functions. Our proposed approach, dubbed Mixture of Word Experts (MoWE), can be seen as a memory-augmented model, where a large set of word-specific experts plays the role of a sparse memory. We demonstrate that MoWE performs significantly better than the T5 family of models with a similar number of FLOPs in a variety of NLP tasks. Moreover, MoWE outperforms traditional MoE models on knowledge-intensive tasks and has similar performance to complex memory-augmented approaches that often require invoking custom mechanisms to search the sparse memory.</abstract>
<identifier type="citekey">nogueira-dos-santos-etal-2024-memory</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-long.249</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-long.249</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>4425</start>
<end>4438</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Memory Augmented Language Models through Mixture of Word Experts
%A Nogueira dos Santos, Cicero
%A Lee-Thorp, James
%A Noble, Isaac
%A Chang, Chung-Ching
%A Uthus, David
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F nogueira-dos-santos-etal-2024-memory
%X Scaling up the number of parameters of language models has proven to be an effective approach to improving performance. For dense models, increasing their size proportionally increases their computational footprint. In this work, we seek to aggressively decouple learning capacity and FLOPs through Mixture-of-Experts (MoE) style models with large knowledge-rich vocabulary-based routing functions. Our proposed approach, dubbed Mixture of Word Experts (MoWE), can be seen as a memory-augmented model, where a large set of word-specific experts plays the role of a sparse memory. We demonstrate that MoWE performs significantly better than the T5 family of models with a similar number of FLOPs in a variety of NLP tasks. Moreover, MoWE outperforms traditional MoE models on knowledge-intensive tasks and has similar performance to complex memory-augmented approaches that often require invoking custom mechanisms to search the sparse memory.
%R 10.18653/v1/2024.naacl-long.249
%U https://aclanthology.org/2024.naacl-long.249
%U https://doi.org/10.18653/v1/2024.naacl-long.249
%P 4425-4438
Markdown (Informal)
[Memory Augmented Language Models through Mixture of Word Experts](https://aclanthology.org/2024.naacl-long.249) (Nogueira dos Santos et al., NAACL 2024)
ACL
Cicero Nogueira dos Santos, James Lee-Thorp, Isaac Noble, Chung-Ching Chang, and David Uthus. 2024. Memory Augmented Language Models through Mixture of Word Experts. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 4425–4438, Mexico City, Mexico. Association for Computational Linguistics.
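
For readers skimming only this page, below is a minimal sketch of the mechanism the abstract describes: vocabulary-based routing that sends each token to a word-specific expert, with the experts collectively acting as a sparse memory. This is not the authors' implementation; the routing-vocabulary size, layer dimensions, and the fallback to a shared dense FFN for non-routed tokens are illustrative assumptions.

```python
# Illustrative sketch (not the paper's code): fixed, id-based routing to
# "word experts". Tokens whose ids fall in a routing vocabulary are processed
# by their own small expert FFN; all other tokens use a shared dense FFN
# (the fallback behavior is an assumption for this sketch).
import numpy as np

rng = np.random.default_rng(0)

d_model, d_expert, d_dense = 64, 32, 128
routing_vocab_size = 100  # hypothetical "knowledge-rich" routing vocabulary

def ffn(x, w_in, w_out):
    """Two-layer feed-forward block with ReLU."""
    return np.maximum(x @ w_in, 0.0) @ w_out

# One small expert per routing word (the sparse "memory") plus a shared FFN.
expert_w_in = rng.normal(0, 0.02, (routing_vocab_size, d_model, d_expert))
expert_w_out = rng.normal(0, 0.02, (routing_vocab_size, d_expert, d_model))
dense_w_in = rng.normal(0, 0.02, (d_model, d_dense))
dense_w_out = rng.normal(0, 0.02, (d_dense, d_model))

def word_expert_layer(hidden, token_ids):
    """Route each position by its token id rather than by a learned gate."""
    out = np.empty_like(hidden)
    for pos, tok in enumerate(token_ids):
        if tok < routing_vocab_size:  # token has a dedicated word expert
            out[pos] = ffn(hidden[pos], expert_w_in[tok], expert_w_out[tok])
        else:                         # fall back to the shared dense FFN
            out[pos] = ffn(hidden[pos], dense_w_in, dense_w_out)
    return out

# Example: a short sequence of token ids passing through the layer.
token_ids = np.array([3, 57, 412, 999])
hidden = rng.normal(size=(len(token_ids), d_model))
print(word_expert_layer(hidden, token_ids).shape)  # (4, 64)
```

Because routing is determined by the token id itself, no learned gating network is evaluated at inference time for routed tokens, which is what lets the expert parameters grow without a proportional increase in FLOPs.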