@inproceedings{alva-principe-etal-2024-lcf,
title = "An {LCF}-{IDF} Document Representation Model Applied to Long Document Classification",
author = "Alva Principe, Renzo Arturo and
Chiarini, Nicola and
Viviani, Marco",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.101",
pages = "1129--1135",
abstract = "A document representation model that has been used for years in NLP and Text Mining tasks is TF-IDF (Term Frequency-Inverse Document Frequency). This model is indeed effective for various tasks like Information Retrieval and Document Classification. However, it may fall short when it comes to capturing the deeper semantic and contextual meaning of a text, which is where Transformer-based Pre-trained Language Models (PLMs) such as BERT have been gaining significant traction in recent years. Despite this, these models also face specific challenges related to Transformers and their attention mechanism limits, especially when dealing with long documents. Therefore, this paper proposes a novel approach to exploit the advantages of the TF-IDF representation while incorporating semantic context, by introducing a Latent Concept Frequency-Inverse Document Frequency (LCF-IDF) document representation model. Its effectiveness is tested with respect to the Long Document Classification task. The results obtained show promising performance of the proposed solution compared to TF-IDF and BERT-like representation models, including those specifically for long documents such as Longformer as well as those designed for particular domains, especially when it comes to Single Label Multi-Class (SLMC) classification.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alva-principe-etal-2024-lcf">
<titleInfo>
<title>An LCF-IDF Document Representation Model Applied to Long Document Classification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Renzo</namePart>
<namePart type="given">Arturo</namePart>
<namePart type="family">Alva Principe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicola</namePart>
<namePart type="family">Chiarini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Viviani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A document representation model that has been used for years in NLP and Text Mining tasks is TF-IDF (Term Frequency-Inverse Document Frequency). This model is indeed effective for various tasks like Information Retrieval and Document Classification. However, it may fall short when it comes to capturing the deeper semantic and contextual meaning of a text, which is where Transformer-based Pre-trained Language Models (PLMs) such as BERT have been gaining significant traction in recent years. Despite this, these models also face specific challenges related to Transformers and their attention mechanism limits, especially when dealing with long documents. Therefore, this paper proposes a novel approach to exploit the advantages of the TF-IDF representation while incorporating semantic context, by introducing a Latent Concept Frequency-Inverse Document Frequency (LCF-IDF) document representation model. Its effectiveness is tested with respect to the Long Document Classification task. The results obtained show promising performance of the proposed solution compared to TF-IDF and BERT-like representation models, including those specifically for long documents such as Longformer as well as those designed for particular domains, especially when it comes to Single Label Multi-Class (SLMC) classification.</abstract>
<identifier type="citekey">alva-principe-etal-2024-lcf</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.101</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>1129</start>
<end>1135</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An LCF-IDF Document Representation Model Applied to Long Document Classification
%A Alva Principe, Renzo Arturo
%A Chiarini, Nicola
%A Viviani, Marco
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F alva-principe-etal-2024-lcf
%X A document representation model that has been used for years in NLP and Text Mining tasks is TF-IDF (Term Frequency-Inverse Document Frequency). This model is indeed effective for various tasks like Information Retrieval and Document Classification. However, it may fall short when it comes to capturing the deeper semantic and contextual meaning of a text, which is where Transformer-based Pre-trained Language Models (PLMs) such as BERT have been gaining significant traction in recent years. Despite this, these models also face specific challenges related to Transformers and their attention mechanism limits, especially when dealing with long documents. Therefore, this paper proposes a novel approach to exploit the advantages of the TF-IDF representation while incorporating semantic context, by introducing a Latent Concept Frequency-Inverse Document Frequency (LCF-IDF) document representation model. Its effectiveness is tested with respect to the Long Document Classification task. The results obtained show promising performance of the proposed solution compared to TF-IDF and BERT-like representation models, including those specifically for long documents such as Longformer as well as those designed for particular domains, especially when it comes to Single Label Multi-Class (SLMC) classification.
%U https://aclanthology.org/2024.lrec-main.101
%P 1129-1135
Markdown (Informal)
[An LCF-IDF Document Representation Model Applied to Long Document Classification](https://aclanthology.org/2024.lrec-main.101) (Alva Principe et al., LREC-COLING 2024)
ACL