@inproceedings{ishigaki-etal-2023-pretraining,
title = "Pretraining Language- and Domain-Specific {BERT} on Automatically Translated Text",
author = "Ishigaki, Tatsuya and
Uehara, Yui and
Topi{\'c}, Goran and
Takamura, Hiroya",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing",
month = sep,
year = "2023",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2023.ranlp-1.60",
pages = "548--555",
abstract = "Domain-specific pretrained language models such as SciBERT are effective for various tasks involving text in specific domains. However, pretraining BERT requires a large-scale language resource, which is not necessarily available in fine-grained domains, especially in non-English languages. In this study, we focus on a setting with no available domain-specific text for pretraining. To this end, we propose a simple framework that trains a BERT on text in the target language automatically translated from a resource-rich language, e.g., English. In this paper, we particularly focus on the materials science domain in Japanese. Our experiments pertain to the task of entity and relation extraction for this domain and language. The experiments demonstrate that the various models pretrained on translated texts consistently perform better than the general BERT in terms of F1 scores although the domain-specific BERTs do not use any human-authored domain-specific text. These results imply that BERTs for various low-resource domains can be successfully trained on texts automatically translated from resource-rich languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ishigaki-etal-2023-pretraining">
<titleInfo>
<title>Pretraining Language- and Domain-Specific BERT on Automatically Translated Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tatsuya</namePart>
<namePart type="family">Ishigaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yui</namePart>
<namePart type="family">Uehara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Goran</namePart>
<namePart type="family">Topić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiroya</namePart>
<namePart type="family">Takamura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Domain-specific pretrained language models such as SciBERT are effective for various tasks involving text in specific domains. However, pretraining BERT requires a large-scale language resource, which is not necessarily available in fine-grained domains, especially in non-English languages. In this study, we focus on a setting with no available domain-specific text for pretraining. To this end, we propose a simple framework that trains a BERT on text in the target language automatically translated from a resource-rich language, e.g., English. In this paper, we particularly focus on the materials science domain in Japanese. Our experiments pertain to the task of entity and relation extraction for this domain and language. The experiments demonstrate that the various models pretrained on translated texts consistently perform better than the general BERT in terms of F1 scores although the domain-specific BERTs do not use any human-authored domain-specific text. These results imply that BERTs for various low-resource domains can be successfully trained on texts automatically translated from resource-rich languages.</abstract>
<identifier type="citekey">ishigaki-etal-2023-pretraining</identifier>
<location>
<url>https://aclanthology.org/2023.ranlp-1.60</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>548</start>
<end>555</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Pretraining Language- and Domain-Specific BERT on Automatically Translated Text
%A Ishigaki, Tatsuya
%A Uehara, Yui
%A Topić, Goran
%A Takamura, Hiroya
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing
%D 2023
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F ishigaki-etal-2023-pretraining
%X Domain-specific pretrained language models such as SciBERT are effective for various tasks involving text in specific domains. However, pretraining BERT requires a large-scale language resource, which is not necessarily available in fine-grained domains, especially in non-English languages. In this study, we focus on a setting with no available domain-specific text for pretraining. To this end, we propose a simple framework that trains a BERT on text in the target language automatically translated from a resource-rich language, e.g., English. In this paper, we particularly focus on the materials science domain in Japanese. Our experiments pertain to the task of entity and relation extraction for this domain and language. The experiments demonstrate that the various models pretrained on translated texts consistently perform better than the general BERT in terms of F1 scores although the domain-specific BERTs do not use any human-authored domain-specific text. These results imply that BERTs for various low-resource domains can be successfully trained on texts automatically translated from resource-rich languages.
%U https://aclanthology.org/2023.ranlp-1.60
%P 548-555
Markdown (Informal)
[Pretraining Language- and Domain-Specific BERT on Automatically Translated Text](https://aclanthology.org/2023.ranlp-1.60) (Ishigaki et al., RANLP 2023)
ACL