@inproceedings{banerjee-etal-2022-dataset,
title = "A Dataset for Term Extraction in {H}indi",
author = "Banerjee, Shubhanker and
Chakravarthi, Bharathi Raja and
McCrae, John Philip",
editor = "Costa, Rute and
Carvalho, Sara and
Ani{\'c}, Ana Ostro{\v{s}}ki and
Khan, Anas Fahad",
booktitle = "Proceedings of the Workshop on Terminology in the 21st century: many faces, many places",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.term-1.4",
pages = "19--25",
abstract = "Automatic Term Extraction (ATE) is one of the core problems in natural language processing and forms a key component of text mining pipelines of domain specific corpora. Complex low-level tasks such as machine translation and summarization for domain specific texts necessitate the use of term extraction systems. However, the development of these systems requires the use of large annotated datasets and thus there has been little progress made on this front for under-resourced languages. As a part of ongoing research, we present a dataset for term extraction from Hindi texts in this paper. To the best of our knowledge, this is the first dataset that provides term annotated documents for Hindi. Furthermore, we have evaluated this dataset on statistical term extraction methods and the results obtained indicate the problems associated with development of term extractors for under-resourced languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="banerjee-etal-2022-dataset">
<titleInfo>
<title>A Dataset for Term Extraction in Hindi</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shubhanker</namePart>
<namePart type="family">Banerjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bharathi</namePart>
<namePart type="given">Raja</namePart>
<namePart type="family">Chakravarthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="given">Philip</namePart>
<namePart type="family">McCrae</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Terminology in the 21st century: many faces, many places</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rute</namePart>
<namePart type="family">Costa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Carvalho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ana</namePart>
<namePart type="given">Ostroški</namePart>
<namePart type="family">Anić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anas</namePart>
<namePart type="given">Fahad</namePart>
<namePart type="family">Khan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatic Term Extraction (ATE) is one of the core problems in natural language processing and forms a key component of text mining pipelines of domain specific corpora. Complex low-level tasks such as machine translation and summarization for domain specific texts necessitate the use of term extraction systems. However, the development of these systems requires the use of large annotated datasets and thus there has been little progress made on this front for under-resourced languages. As a part of ongoing research, we present a dataset for term extraction from Hindi texts in this paper. To the best of our knowledge, this is the first dataset that provides term annotated documents for Hindi. Furthermore, we have evaluated this dataset on statistical term extraction methods and the results obtained indicate the problems associated with development of term extractors for under-resourced languages.</abstract>
<identifier type="citekey">banerjee-etal-2022-dataset</identifier>
<location>
<url>https://aclanthology.org/2022.term-1.4</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>19</start>
<end>25</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Dataset for Term Extraction in Hindi
%A Banerjee, Shubhanker
%A Chakravarthi, Bharathi Raja
%A McCrae, John Philip
%Y Costa, Rute
%Y Carvalho, Sara
%Y Anić, Ana Ostroški
%Y Khan, Anas Fahad
%S Proceedings of the Workshop on Terminology in the 21st century: many faces, many places
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F banerjee-etal-2022-dataset
%X Automatic Term Extraction (ATE) is one of the core problems in natural language processing and forms a key component of text mining pipelines of domain specific corpora. Complex low-level tasks such as machine translation and summarization for domain specific texts necessitate the use of term extraction systems. However, the development of these systems requires the use of large annotated datasets and thus there has been little progress made on this front for under-resourced languages. As a part of ongoing research, we present a dataset for term extraction from Hindi texts in this paper. To the best of our knowledge, this is the first dataset that provides term annotated documents for Hindi. Furthermore, we have evaluated this dataset on statistical term extraction methods and the results obtained indicate the problems associated with development of term extractors for under-resourced languages.
%U https://aclanthology.org/2022.term-1.4
%P 19-25
Markdown (Informal)
[A Dataset for Term Extraction in Hindi](https://aclanthology.org/2022.term-1.4) (Banerjee et al., TERM 2022)
ACL
- Shubhanker Banerjee, Bharathi Raja Chakravarthi, and John Philip McCrae. 2022. A Dataset for Term Extraction in Hindi. In Proceedings of the Workshop on Terminology in the 21st century: many faces, many places, pages 19–25, Marseille, France. European Language Resources Association.