@inproceedings{banerjee-etal-2025-benchmarking,
title = "Benchmarking {H}indi Term Extraction in Education: A Dataset and Analysis",
author = "Banerjee, Shubhanker and
Chakravarthi, Bharathi Raja and
McCrae, John Philip",
editor = "Alam, Mehwish and
Tchechmedjiev, Andon and
Gracia, Jorge and
Gromann, Dagmar and
di Buono, Maria Pia and
Monti, Johanna and
Ionov, Maxim",
booktitle = "Proceedings of the 5th Conference on Language, Data and Knowledge",
month = sep,
year = "2025",
address = "Naples, Italy",
publisher = "Unior Press",
url = "https://aclanthology.org/2025.ldk-1.3/",
pages = "19--30",
ISBN = "978-88-6719-333-2",
abstract = "This paper introduces the HTEC Hindi Term Extraction Dataset 2.0, a resource designed to support terminology extraction and classification tasks within the education domain. HTEC 2.0 has been developed with the objective of providing a high-quality benchmark dataset for the evaluation of term recognition and classification methodologies in Hindi educational discourse. The dataset consists of 97 documents sourced from Hindi Wikipedia, covering a diverse range of topics relevant to the education sector. Within these documents, 1,702 terms have been manually annotated where each term is defined as a single-word or multi-word expression that conveys a domain-specific meaning. The annotated terms in HTEC 2.0 are systematically categorized into seven distinct classes. Furthermore, this paper outlines the development of annotation guidelines, detailing the criteria used to determine term boundaries and category assignments. By offering a structured dataset with clearly defined term classifications, HTEC 2.0 serves as a valuable resource for researchers working on terminology extraction, domain-specific named entity recognition, and text classification in Hindi."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="banerjee-etal-2025-benchmarking">
<titleInfo>
<title>Benchmarking Hindi Term Extraction in Education: A Dataset and Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shubhanker</namePart>
<namePart type="family">Banerjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bharathi</namePart>
<namePart type="given">Raja</namePart>
<namePart type="family">Chakravarthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="given">Philip</namePart>
<namePart type="family">McCrae</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Conference on Language, Data and Knowledge</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mehwish</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andon</namePart>
<namePart type="family">Tchechmedjiev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorge</namePart>
<namePart type="family">Gracia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dagmar</namePart>
<namePart type="family">Gromann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Pia</namePart>
<namePart type="family">di Buono</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johanna</namePart>
<namePart type="family">Monti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maxim</namePart>
<namePart type="family">Ionov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Unior Press</publisher>
<place>
<placeTerm type="text">Naples, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-88-6719-333-2</identifier>
</relatedItem>
<abstract>This paper introduces the HTEC Hindi Term Extraction Dataset 2.0, a resource designed to support terminology extraction and classification tasks within the education domain. HTEC 2.0 has been developed with the objective of providing a high-quality benchmark dataset for the evaluation of term recognition and classification methodologies in Hindi educational discourse. The dataset consists of 97 documents sourced from Hindi Wikipedia, covering a diverse range of topics relevant to the education sector. Within these documents, 1,702 terms have been manually annotated where each term is defined as a single-word or multi-word expression that conveys a domain-specific meaning. The annotated terms in HTEC 2.0 are systematically categorized into seven distinct classes. Furthermore, this paper outlines the development of annotation guidelines, detailing the criteria used to determine term boundaries and category assignments. By offering a structured dataset with clearly defined term classifications, HTEC 2.0 serves as a valuable resource for researchers working on terminology extraction, domain-specific named entity recognition, and text classification in Hindi.</abstract>
<identifier type="citekey">banerjee-etal-2025-benchmarking</identifier>
<location>
<url>https://aclanthology.org/2025.ldk-1.3/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>19</start>
<end>30</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Benchmarking Hindi Term Extraction in Education: A Dataset and Analysis
%A Banerjee, Shubhanker
%A Chakravarthi, Bharathi Raja
%A McCrae, John Philip
%Y Alam, Mehwish
%Y Tchechmedjiev, Andon
%Y Gracia, Jorge
%Y Gromann, Dagmar
%Y di Buono, Maria Pia
%Y Monti, Johanna
%Y Ionov, Maxim
%S Proceedings of the 5th Conference on Language, Data and Knowledge
%D 2025
%8 September
%I Unior Press
%C Naples, Italy
%@ 978-88-6719-333-2
%F banerjee-etal-2025-benchmarking
%X This paper introduces the HTEC Hindi Term Extraction Dataset 2.0, a resource designed to support terminology extraction and classification tasks within the education domain. HTEC 2.0 has been developed with the objective of providing a high-quality benchmark dataset for the evaluation of term recognition and classification methodologies in Hindi educational discourse. The dataset consists of 97 documents sourced from Hindi Wikipedia, covering a diverse range of topics relevant to the education sector. Within these documents, 1,702 terms have been manually annotated where each term is defined as a single-word or multi-word expression that conveys a domain-specific meaning. The annotated terms in HTEC 2.0 are systematically categorized into seven distinct classes. Furthermore, this paper outlines the development of annotation guidelines, detailing the criteria used to determine term boundaries and category assignments. By offering a structured dataset with clearly defined term classifications, HTEC 2.0 serves as a valuable resource for researchers working on terminology extraction, domain-specific named entity recognition, and text classification in Hindi.
%U https://aclanthology.org/2025.ldk-1.3/
%P 19-30
Markdown (Informal)
[Benchmarking Hindi Term Extraction in Education: A Dataset and Analysis](https://aclanthology.org/2025.ldk-1.3/) (Banerjee et al., LDK 2025)
ACL