@inproceedings{fusco-etal-2022-unsupervised,
title = "Unsupervised Term Extraction for Highly Technical Domains",
author = "Fusco, Francesco and
Staar, Peter and
Antognini, Diego",
editor = "Li, Yunyao and
Lazaridou, Angeliki",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = dec,
year = "2022",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-industry.1",
doi = "10.18653/v1/2022.emnlp-industry.1",
pages = "1--8",
abstract = "Term extraction is an information extraction task at the root of knowledge discovery platforms. Developing term extractors that are able to generalize across very diverse and potentially highly technical domains is challenging, as annotations for domains requiring in-depth expertise are scarce and expensive to obtain. In this paper, we describe the term extraction subsystem of a commercial knowledge discovery platform that targets highly technical fields such as pharma, medical, and material science. To be able to generalize across domains, we introduce a fully unsupervised annotator (UA). It extracts terms by combining novel morphological signals from sub-word tokenization with term-to-topic and intra-term similarity metrics, computed using general-domain pre-trained sentence-encoders. The annotator is used to implement a weakly-supervised setup, where transformer-models are fine-tuned (or pre-trained) over the training data generated by running the UA over large unlabeled corpora. Our experiments demonstrate that our setup can improve the predictive performance while decreasing the inference latency on both CPUs and GPUs. Our annotators provide a very competitive baseline for all the cases where annotations are not available.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fusco-etal-2022-unsupervised">
<titleInfo>
<title>Unsupervised Term Extraction for Highly Technical Domains</title>
</titleInfo>
<name type="personal">
<namePart type="given">Francesco</namePart>
<namePart type="family">Fusco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Staar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diego</namePart>
<namePart type="family">Antognini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Angeliki</namePart>
<namePart type="family">Lazaridou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Term extraction is an information extraction task at the root of knowledge discovery platforms. Developing term extractors that are able to generalize across very diverse and potentially highly technical domains is challenging, as annotations for domains requiring in-depth expertise are scarce and expensive to obtain. In this paper, we describe the term extraction subsystem of a commercial knowledge discovery platform that targets highly technical fields such as pharma, medical, and material science. To be able to generalize across domains, we introduce a fully unsupervised annotator (UA). It extracts terms by combining novel morphological signals from sub-word tokenization with term-to-topic and intra-term similarity metrics, computed using general-domain pre-trained sentence-encoders. The annotator is used to implement a weakly-supervised setup, where transformer-models are fine-tuned (or pre-trained) over the training data generated by running the UA over large unlabeled corpora. Our experiments demonstrate that our setup can improve the predictive performance while decreasing the inference latency on both CPUs and GPUs. Our annotators provide a very competitive baseline for all the cases where annotations are not available.</abstract>
<identifier type="citekey">fusco-etal-2022-unsupervised</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-industry.1</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-industry.1</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>1</start>
<end>8</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Unsupervised Term Extraction for Highly Technical Domains
%A Fusco, Francesco
%A Staar, Peter
%A Antognini, Diego
%Y Li, Yunyao
%Y Lazaridou, Angeliki
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F fusco-etal-2022-unsupervised
%X Term extraction is an information extraction task at the root of knowledge discovery platforms. Developing term extractors that are able to generalize across very diverse and potentially highly technical domains is challenging, as annotations for domains requiring in-depth expertise are scarce and expensive to obtain. In this paper, we describe the term extraction subsystem of a commercial knowledge discovery platform that targets highly technical fields such as pharma, medical, and material science. To be able to generalize across domains, we introduce a fully unsupervised annotator (UA). It extracts terms by combining novel morphological signals from sub-word tokenization with term-to-topic and intra-term similarity metrics, computed using general-domain pre-trained sentence-encoders. The annotator is used to implement a weakly-supervised setup, where transformer-models are fine-tuned (or pre-trained) over the training data generated by running the UA over large unlabeled corpora. Our experiments demonstrate that our setup can improve the predictive performance while decreasing the inference latency on both CPUs and GPUs. Our annotators provide a very competitive baseline for all the cases where annotations are not available.
%R 10.18653/v1/2022.emnlp-industry.1
%U https://aclanthology.org/2022.emnlp-industry.1
%U https://doi.org/10.18653/v1/2022.emnlp-industry.1
%P 1-8
Markdown (Informal)
[Unsupervised Term Extraction for Highly Technical Domains](https://aclanthology.org/2022.emnlp-industry.1) (Fusco et al., EMNLP 2022)
ACL
- Francesco Fusco, Peter Staar, and Diego Antognini. 2022. Unsupervised Term Extraction for Highly Technical Domains. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 1–8, Abu Dhabi, UAE. Association for Computational Linguistics.