BibTeX
@inproceedings{nityasya-etal-2023-scientific,
    title = "On {``}Scientific Debt{''} in {NLP}: A Case for More Rigour in Language Model Pre-Training Research",
    author = "Nityasya, Made Nindyatama and
      Wibowo, Haryo and
      Aji, Alham Fikri and
      Winata, Genta and
      Prasojo, Radityo Eko and
      Blunsom, Phil and
      Kuncoro, Adhiguna",
    editor = "Rogers, Anna and
      Boyd-Graber, Jordan and
      Okazaki, Naoaki",
    booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2023",
    address = "Toronto, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.acl-long.477",
    doi = "10.18653/v1/2023.acl-long.477",
    pages = "8554--8572",
    abstract = "This evidence-based position paper critiques current research practices within the language model pre-training literature. Despite rapid recent progress afforded by increasingly better pre-trained language models (PLMs), current PLM research practices often conflate different possible sources of model improvement, without conducting proper ablation studies and principled comparisons between different models under comparable conditions. These practices (i) leave us ill-equipped to understand which pre-training approaches should be used under what circumstances; (ii) impede reproducibility and credit assignment; and (iii) render it difficult to understand: {``}How exactly does each factor contribute to the progress that we have today?{''} We provide a case in point by revisiting the success of BERT over its baselines, ELMo and GPT-1, and demonstrate how {---} under comparable conditions where the baselines are tuned to a similar extent {---} these baselines (and even-simpler variants thereof) can, in fact, achieve competitive or better performance than BERT. These findings demonstrate how disentangling different factors of model improvements can lead to valuable new insights. We conclude with recommendations for how to encourage and incentivize this line of work, and accelerate progress towards a better and more systematic understanding of what factors drive the progress of our foundation models today.",
}
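
As a usage note, here is a minimal LaTeX sketch of how this entry could be cited, assuming the record above is saved as anthology.bib; the filename, the plainnat style, and the example sentence are illustrative assumptions, not part of the Anthology record:

\documentclass{article}
\usepackage{natbib}  % provides \citep and \citet for author-year citations

\begin{document}
% Cite the record by the citekey from the BibTeX entry above
Pre-training comparisons deserve more rigour \citep{nityasya-etal-2023-scientific}.

\bibliographystyle{plainnat}  % any natbib-compatible style works here
\bibliography{anthology}      % assumes the entry is stored in anthology.bib
\end{document}

Running bibtex between LaTeX passes resolves the \citep command against the entry's citekey.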
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="nityasya-etal-2023-scientific">
    <titleInfo>
      <title>On “Scientific Debt” in NLP: A Case for More Rigour in Language Model Pre-Training Research</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Made</namePart>
      <namePart type="given">Nindyatama</namePart>
      <namePart type="family">Nityasya</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Haryo</namePart>
      <namePart type="family">Wibowo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alham</namePart>
      <namePart type="given">Fikri</namePart>
      <namePart type="family">Aji</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Genta</namePart>
      <namePart type="family">Winata</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Radityo</namePart>
      <namePart type="given">Eko</namePart>
      <namePart type="family">Prasojo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Phil</namePart>
      <namePart type="family">Blunsom</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Adhiguna</namePart>
      <namePart type="family">Kuncoro</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Rogers</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jordan</namePart>
        <namePart type="family">Boyd-Graber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Naoaki</namePart>
        <namePart type="family">Okazaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Toronto, Canada</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This evidence-based position paper critiques current research practices within the language model pre-training literature. Despite rapid recent progress afforded by increasingly better pre-trained language models (PLMs), current PLM research practices often conflate different possible sources of model improvement, without conducting proper ablation studies and principled comparisons between different models under comparable conditions. These practices (i) leave us ill-equipped to understand which pre-training approaches should be used under what circumstances; (ii) impede reproducibility and credit assignment; and (iii) render it difficult to understand: “How exactly does each factor contribute to the progress that we have today?” We provide a case in point by revisiting the success of BERT over its baselines, ELMo and GPT-1, and demonstrate how — under comparable conditions where the baselines are tuned to a similar extent — these baselines (and even-simpler variants thereof) can, in fact, achieve competitive or better performance than BERT. These findings demonstrate how disentangling different factors of model improvements can lead to valuable new insights. We conclude with recommendations for how to encourage and incentivize this line of work, and accelerate progress towards a better and more systematic understanding of what factors drive the progress of our foundation models today.</abstract>
    <identifier type="citekey">nityasya-etal-2023-scientific</identifier>
    <identifier type="doi">10.18653/v1/2023.acl-long.477</identifier>
    <location>
      <url>https://aclanthology.org/2023.acl-long.477</url>
    </location>
    <part>
      <date>2023-07</date>
      <extent unit="page">
        <start>8554</start>
        <end>8572</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T On “Scientific Debt” in NLP: A Case for More Rigour in Language Model Pre-Training Research
%A Nityasya, Made Nindyatama
%A Wibowo, Haryo
%A Aji, Alham Fikri
%A Winata, Genta
%A Prasojo, Radityo Eko
%A Blunsom, Phil
%A Kuncoro, Adhiguna
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F nityasya-etal-2023-scientific
%X This evidence-based position paper critiques current research practices within the language model pre-training literature. Despite rapid recent progress afforded by increasingly better pre-trained language models (PLMs), current PLM research practices often conflate different possible sources of model improvement, without conducting proper ablation studies and principled comparisons between different models under comparable conditions. These practices (i) leave us ill-equipped to understand which pre-training approaches should be used under what circumstances; (ii) impede reproducibility and credit assignment; and (iii) render it difficult to understand: “How exactly does each factor contribute to the progress that we have today?” We provide a case in point by revisiting the success of BERT over its baselines, ELMo and GPT-1, and demonstrate how — under comparable conditions where the baselines are tuned to a similar extent — these baselines (and even-simpler variants thereof) can, in fact, achieve competitive or better performance than BERT. These findings demonstrate how disentangling different factors of model improvements can lead to valuable new insights. We conclude with recommendations for how to encourage and incentivize this line of work, and accelerate progress towards a better and more systematic understanding of what factors drive the progress of our foundation models today.
%R 10.18653/v1/2023.acl-long.477
%U https://aclanthology.org/2023.acl-long.477
%U https://doi.org/10.18653/v1/2023.acl-long.477
%P 8554-8572
Markdown (Informal)
[On “Scientific Debt” in NLP: A Case for More Rigour in Language Model Pre-Training Research](https://aclanthology.org/2023.acl-long.477) (Nityasya et al., ACL 2023)
ACL
Made Nindyatama Nityasya, Haryo Wibowo, Alham Fikri Aji, Genta Winata, Radityo Eko Prasojo, Phil Blunsom, and Adhiguna Kuncoro. 2023. On “Scientific Debt” in NLP: A Case for More Rigour in Language Model Pre-Training Research. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 8554–8572, Toronto, Canada. Association for Computational Linguistics.