@inproceedings{haj-yahia-etal-2019-towards,
title = "Towards Unsupervised Text Classification Leveraging Experts and Word Embeddings",
author = "Haj-Yahia, Zied and
Sieg, Adrien and
Deleris, L{\'e}a A.",
editor = "Korhonen, Anna and
Traum, David and
M{\`a}rquez, Llu{\'i}s",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P19-1036/",
doi = "10.18653/v1/P19-1036",
pages = "371--379",
abstract = "Text classification aims at mapping documents into a set of predefined categories. Supervised machine learning models have shown great success in this area but they require a large number of labeled documents to reach adequate accuracy. This is particularly true when the number of target categories is in the tens or the hundreds. In this work, we explore an unsupervised approach to classify documents into categories simply described by a label. The proposed method is inspired by the way a human proceeds in this situation: It draws on textual similarity between the most relevant words in each document and a dictionary of keywords for each category reflecting its semantics and lexical field. The novelty of our method hinges on the enrichment of the category labels through a combination of human expertise and language models, both generic and domain specific. Our experiments on 5 standard corpora show that the proposed method increases F1-score over relying solely on human expertise and can also be on par with simple supervised approaches. It thus provides a practical alternative to situations where low cost text categorization is needed, as we illustrate with our application to operational risk incidents classification."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="haj-yahia-etal-2019-towards">
<titleInfo>
<title>Towards Unsupervised Text Classification Leveraging Experts and Word Embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zied</namePart>
<namePart type="family">Haj-Yahia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adrien</namePart>
<namePart type="family">Sieg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Léa</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Deleris</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Korhonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Traum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Màrquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Text classification aims at mapping documents into a set of predefined categories. Supervised machine learning models have shown great success in this area but they require a large number of labeled documents to reach adequate accuracy. This is particularly true when the number of target categories is in the tens or the hundreds. In this work, we explore an unsupervised approach to classify documents into categories simply described by a label. The proposed method is inspired by the way a human proceeds in this situation: It draws on textual similarity between the most relevant words in each document and a dictionary of keywords for each category reflecting its semantics and lexical field. The novelty of our method hinges on the enrichment of the category labels through a combination of human expertise and language models, both generic and domain specific. Our experiments on 5 standard corpora show that the proposed method increases F1-score over relying solely on human expertise and can also be on par with simple supervised approaches. It thus provides a practical alternative to situations where low cost text categorization is needed, as we illustrate with our application to operational risk incidents classification.</abstract>
<identifier type="citekey">haj-yahia-etal-2019-towards</identifier>
<identifier type="doi">10.18653/v1/P19-1036</identifier>
<location>
<url>https://aclanthology.org/P19-1036/</url>
</location>
<part>
<date>2019-07</date>
<extent unit="page">
<start>371</start>
<end>379</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Unsupervised Text Classification Leveraging Experts and Word Embeddings
%A Haj-Yahia, Zied
%A Sieg, Adrien
%A Deleris, Léa A.
%Y Korhonen, Anna
%Y Traum, David
%Y Màrquez, Lluís
%S Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
%D 2019
%8 July
%I Association for Computational Linguistics
%C Florence, Italy
%F haj-yahia-etal-2019-towards
%X Text classification aims at mapping documents into a set of predefined categories. Supervised machine learning models have shown great success in this area but they require a large number of labeled documents to reach adequate accuracy. This is particularly true when the number of target categories is in the tens or the hundreds. In this work, we explore an unsupervised approach to classify documents into categories simply described by a label. The proposed method is inspired by the way a human proceeds in this situation: It draws on textual similarity between the most relevant words in each document and a dictionary of keywords for each category reflecting its semantics and lexical field. The novelty of our method hinges on the enrichment of the category labels through a combination of human expertise and language models, both generic and domain specific. Our experiments on 5 standard corpora show that the proposed method increases F1-score over relying solely on human expertise and can also be on par with simple supervised approaches. It thus provides a practical alternative to situations where low cost text categorization is needed, as we illustrate with our application to operational risk incidents classification.
%R 10.18653/v1/P19-1036
%U https://aclanthology.org/P19-1036/
%U https://doi.org/10.18653/v1/P19-1036
%P 371-379
Markdown (Informal)
[Towards Unsupervised Text Classification Leveraging Experts and Word Embeddings](https://aclanthology.org/P19-1036/) (Haj-Yahia et al., ACL 2019)
ACL