@inproceedings{bai-etal-2021-pre,
title = "Pre-train or Annotate? Domain Adaptation with a Constrained Budget",
author = "Bai, Fan and
Ritter, Alan and
Xu, Wei",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.409",
doi = "10.18653/v1/2021.emnlp-main.409",
pages = "5002--5015",
abstract = "Recent work has demonstrated that pre-training in-domain language models can boost performance when adapting to a new domain. However, the costs associated with pre-training raise an important question: given a fixed budget, what steps should an NLP practitioner take to maximize performance? In this paper, we study domain adaptation under budget constraints, and approach it as a customer choice problem between data annotation and pre-training. Specifically, we measure the annotation cost of three procedural text datasets and the pre-training cost of three in-domain language models. Then we evaluate the utility of different combinations of pre-training and data annotation under varying budget constraints to assess which combination strategy works best. We find that, for small budgets, spending all funds on annotation leads to the best performance; once the budget becomes large enough, a combination of data annotation and in-domain pre-training works more optimally. We therefore suggest that task-specific data annotation should be part of an economical strategy when adapting an NLP model to a new domain.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bai-etal-2021-pre">
<titleInfo>
<title>Pre-train or Annotate? Domain Adaptation with a Constrained Budget</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fan</namePart>
<namePart type="family">Bai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanjing</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Scott</namePart>
<namePart type="given">Wen-tau</namePart>
<namePart type="family">Yih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online and Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent work has demonstrated that pre-training in-domain language models can boost performance when adapting to a new domain. However, the costs associated with pre-training raise an important question: given a fixed budget, what steps should an NLP practitioner take to maximize performance? In this paper, we study domain adaptation under budget constraints, and approach it as a customer choice problem between data annotation and pre-training. Specifically, we measure the annotation cost of three procedural text datasets and the pre-training cost of three in-domain language models. Then we evaluate the utility of different combinations of pre-training and data annotation under varying budget constraints to assess which combination strategy works best. We find that, for small budgets, spending all funds on annotation leads to the best performance; once the budget becomes large enough, a combination of data annotation and in-domain pre-training works more optimally. We therefore suggest that task-specific data annotation should be part of an economical strategy when adapting an NLP model to a new domain.</abstract>
<identifier type="citekey">bai-etal-2021-pre</identifier>
<identifier type="doi">10.18653/v1/2021.emnlp-main.409</identifier>
<location>
<url>https://aclanthology.org/2021.emnlp-main.409</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>5002</start>
<end>5015</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Pre-train or Annotate? Domain Adaptation with a Constrained Budget
%A Bai, Fan
%A Ritter, Alan
%A Xu, Wei
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online and Punta Cana, Dominican Republic
%F bai-etal-2021-pre
%X Recent work has demonstrated that pre-training in-domain language models can boost performance when adapting to a new domain. However, the costs associated with pre-training raise an important question: given a fixed budget, what steps should an NLP practitioner take to maximize performance? In this paper, we study domain adaptation under budget constraints, and approach it as a customer choice problem between data annotation and pre-training. Specifically, we measure the annotation cost of three procedural text datasets and the pre-training cost of three in-domain language models. Then we evaluate the utility of different combinations of pre-training and data annotation under varying budget constraints to assess which combination strategy works best. We find that, for small budgets, spending all funds on annotation leads to the best performance; once the budget becomes large enough, a combination of data annotation and in-domain pre-training works more optimally. We therefore suggest that task-specific data annotation should be part of an economical strategy when adapting an NLP model to a new domain.
%R 10.18653/v1/2021.emnlp-main.409
%U https://aclanthology.org/2021.emnlp-main.409
%U https://doi.org/10.18653/v1/2021.emnlp-main.409
%P 5002-5015
Markdown (Informal)
[Pre-train or Annotate? Domain Adaptation with a Constrained Budget](https://aclanthology.org/2021.emnlp-main.409) (Bai et al., EMNLP 2021)
ACL