@inproceedings{renduchintala-etal-2023-ingenious,
    title = "{INGENIOUS}: Using Informative Data Subsets for Efficient Pre-Training of Language Models",
    author = "Renduchintala, H S V N S Kowndinya and
      Killamsetty, Krishnateja and
      Bhatia, Sumit and
      Aggarwal, Milan and
      Ramakrishnan, Ganesh and
      Iyer, Rishabh and
      Krishnamurthy, Balaji",
    editor = "Bouamor, Houda and
      Pino, Juan and
      Bali, Kalika",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
    month = dec,
    year = "2023",
    address = "Singapore",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.findings-emnlp.445",
    doi = "10.18653/v1/2023.findings-emnlp.445",
    pages = "6690--6705",
    abstract = "A salient characteristic of pre-trained language models (PTLMs) is a remarkable improvement in their generalization capability and emergence of new capabilities with increasing model capacity and pre-training dataset size. Consequently, we are witnessing the development of enormous models pushing the state-of-the-art. It is, however, imperative to realize that this inevitably leads to prohibitively long training times, extortionate computing costs, and a detrimental environmental impact. Significant efforts are underway to make PTLM training more efficient through innovations in model architectures, training pipelines, and loss function design, with scant attention being paid to optimizing the utility of training data. The key question that we ask is whether it is possible to train PTLMs by employing only highly informative subsets of the training data while maintaining downstream performance? Building upon the recent progress in informative data subset selection, we show how we can employ submodular optimization to select highly representative subsets of the training corpora and demonstrate that the proposed framework can be applied to efficiently train multiple PTLMs (BERT, BioBERT, GPT-2) using only a fraction of data. Further, we perform a rigorous empirical evaluation to show that the resulting models achieve up to {\textasciitilde}99{\%} of the performance of the fully-trained models. We made our framework publicly available at \url{https://github.com/Efficient-AI/ingenious}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="renduchintala-etal-2023-ingenious">
    <titleInfo>
      <title>INGENIOUS: Using Informative Data Subsets for Efficient Pre-Training of Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">H</namePart>
      <namePart type="given">S</namePart>
      <namePart type="given">V</namePart>
      <namePart type="given">N</namePart>
      <namePart type="given">S</namePart>
      <namePart type="given">Kowndinya</namePart>
      <namePart type="family">Renduchintala</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Krishnateja</namePart>
      <namePart type="family">Killamsetty</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sumit</namePart>
      <namePart type="family">Bhatia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Milan</namePart>
      <namePart type="family">Aggarwal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ganesh</namePart>
      <namePart type="family">Ramakrishnan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rishabh</namePart>
      <namePart type="family">Iyer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Balaji</namePart>
      <namePart type="family">Krishnamurthy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Houda</namePart>
        <namePart type="family">Bouamor</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juan</namePart>
        <namePart type="family">Pino</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kalika</namePart>
        <namePart type="family">Bali</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Singapore</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>A salient characteristic of pre-trained language models (PTLMs) is a remarkable improvement in their generalization capability and emergence of new capabilities with increasing model capacity and pre-training dataset size. Consequently, we are witnessing the development of enormous models pushing the state-of-the-art. It is, however, imperative to realize that this inevitably leads to prohibitively long training times, extortionate computing costs, and a detrimental environmental impact. Significant efforts are underway to make PTLM training more efficient through innovations in model architectures, training pipelines, and loss function design, with scant attention being paid to optimizing the utility of training data. The key question that we ask is whether it is possible to train PTLMs by employing only highly informative subsets of the training data while maintaining downstream performance? Building upon the recent progress in informative data subset selection, we show how we can employ submodular optimization to select highly representative subsets of the training corpora and demonstrate that the proposed framework can be applied to efficiently train multiple PTLMs (BERT, BioBERT, GPT-2) using only a fraction of data. Further, we perform a rigorous empirical evaluation to show that the resulting models achieve up to ~99% of the performance of the fully-trained models. We made our framework publicly available at https://github.com/Efficient-AI/ingenious.</abstract>
    <identifier type="citekey">renduchintala-etal-2023-ingenious</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-emnlp.445</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-emnlp.445</url>
    </location>
    <part>
      <date>2023-12</date>
      <extent unit="page">
        <start>6690</start>
        <end>6705</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T INGENIOUS: Using Informative Data Subsets for Efficient Pre-Training of Language Models
%A Renduchintala, H. S. V. N. S. Kowndinya
%A Killamsetty, Krishnateja
%A Bhatia, Sumit
%A Aggarwal, Milan
%A Ramakrishnan, Ganesh
%A Iyer, Rishabh
%A Krishnamurthy, Balaji
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F renduchintala-etal-2023-ingenious
%X A salient characteristic of pre-trained language models (PTLMs) is a remarkable improvement in their generalization capability and emergence of new capabilities with increasing model capacity and pre-training dataset size. Consequently, we are witnessing the development of enormous models pushing the state-of-the-art. It is, however, imperative to realize that this inevitably leads to prohibitively long training times, extortionate computing costs, and a detrimental environmental impact. Significant efforts are underway to make PTLM training more efficient through innovations in model architectures, training pipelines, and loss function design, with scant attention being paid to optimizing the utility of training data. The key question that we ask is whether it is possible to train PTLMs by employing only highly informative subsets of the training data while maintaining downstream performance? Building upon the recent progress in informative data subset selection, we show how we can employ submodular optimization to select highly representative subsets of the training corpora and demonstrate that the proposed framework can be applied to efficiently train multiple PTLMs (BERT, BioBERT, GPT-2) using only a fraction of data. Further, we perform a rigorous empirical evaluation to show that the resulting models achieve up to ~99% of the performance of the fully-trained models. We made our framework publicly available at https://github.com/Efficient-AI/ingenious.
%R 10.18653/v1/2023.findings-emnlp.445
%U https://aclanthology.org/2023.findings-emnlp.445
%U https://doi.org/10.18653/v1/2023.findings-emnlp.445
%P 6690-6705
Markdown (Informal)
[INGENIOUS: Using Informative Data Subsets for Efficient Pre-Training of Language Models](https://aclanthology.org/2023.findings-emnlp.445) (Renduchintala et al., Findings 2023)
ACL
H S V N S Kowndinya Renduchintala, Krishnateja Killamsetty, Sumit Bhatia, Milan Aggarwal, Ganesh Ramakrishnan, Rishabh Iyer, and Balaji Krishnamurthy. 2023. INGENIOUS: Using Informative Data Subsets for Efficient Pre-Training of Language Models. In Findings of the Association for Computational Linguistics: EMNLP 2023, pages 6690–6705, Singapore. Association for Computational Linguistics.
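
The abstract above describes selecting highly representative pre-training subsets via submodular optimization. The sketch below is a minimal illustration of that general idea, not the authors' implementation (their code is at https://github.com/Efficient-AI/ingenious): it greedily maximizes a facility-location objective over document embeddings. The random embedding stand-ins, cosine-similarity kernel, and subset budget are illustrative assumptions.

```python
# Minimal sketch: greedy facility-location subset selection over embeddings.
# Assumptions: random vectors stand in for real document embeddings, and a
# dense cosine-similarity kernel fits in memory (fine for a toy example).
import numpy as np


def facility_location_greedy(similarity: np.ndarray, budget: int) -> list[int]:
    """Greedily maximize f(S) = sum_i max_{j in S} sim(i, j), which rewards
    subsets whose members collectively stay close to every point in the corpus."""
    n = similarity.shape[0]
    selected: list[int] = []
    best_cover = np.zeros(n)  # similarity of each point to its closest selected point
    for _ in range(budget):
        # Marginal gain of each candidate: total improvement in coverage it would add.
        gains = np.maximum(similarity - best_cover[:, None], 0.0).sum(axis=0)
        gains[selected] = -np.inf  # never re-select an element
        j = int(np.argmax(gains))
        selected.append(j)
        best_cover = np.maximum(best_cover, similarity[:, j])
    return selected


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    # Stand-in for sentence/document embeddings of a pre-training corpus.
    embeddings = rng.normal(size=(1000, 64))
    embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)
    sim = embeddings @ embeddings.T               # cosine similarity kernel
    subset = facility_location_greedy(sim, budget=100)  # keep ~10% of the data
    print(f"selected {len(subset)} representative documents")
```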