@inproceedings{brum-etal-2024-unsupervised,
title = "Unsupervised Grouping of Public Procurement Similar Items: Which Text Representation Should {I} Use?",
author = "Brum, Pedro P. V. and
Silva, Mariana O. and
Oliveira, Gabriel P. and
Costa, Lucas G. L. and
Lacerda, Anisio and
Pappa, Gisele",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1492",
pages = "17176--17185",
abstract = "In public procurement, establishing reference prices is essential to guide competitors in setting product prices. Group-purchased products, which are not standardized by default, are necessary to estimate reference prices. Text clustering techniques can be used to group similar items based on their descriptions, enabling the definition of reference prices for specific products or services. However, selecting an appropriate representation for text is challenging. This paper introduces a framework for text cleaning, extraction, and representation. We test eight distinct sentence representations tailored for public procurement item descriptions. Among these representations, we propose an approach that captures the most important components of item descriptions. Through extensive evaluation of a dataset comprising over 2 million items, our findings show that using sophisticated supervised methods to derive vectors for unsupervised tasks offers little advantages over leveraging unsupervised methods. Our results also highlight that domain-specific contextual knowledge is crucial for representation improvement.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="brum-etal-2024-unsupervised">
<titleInfo>
<title>Unsupervised Grouping of Public Procurement Similar Items: Which Text Representation Should I Use?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pedro</namePart>
<namePart type="given">P</namePart>
<namePart type="given">V</namePart>
<namePart type="family">Brum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mariana</namePart>
<namePart type="given">O</namePart>
<namePart type="family">Silva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Oliveira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucas</namePart>
<namePart type="given">G</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Costa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anisio</namePart>
<namePart type="family">Lacerda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gisele</namePart>
<namePart type="family">Pappa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In public procurement, establishing reference prices is essential to guide competitors in setting product prices. Group-purchased products, which are not standardized by default, are necessary to estimate reference prices. Text clustering techniques can be used to group similar items based on their descriptions, enabling the definition of reference prices for specific products or services. However, selecting an appropriate representation for text is challenging. This paper introduces a framework for text cleaning, extraction, and representation. We test eight distinct sentence representations tailored for public procurement item descriptions. Among these representations, we propose an approach that captures the most important components of item descriptions. Through extensive evaluation of a dataset comprising over 2 million items, our findings show that using sophisticated supervised methods to derive vectors for unsupervised tasks offers little advantages over leveraging unsupervised methods. Our results also highlight that domain-specific contextual knowledge is crucial for representation improvement.</abstract>
<identifier type="citekey">brum-etal-2024-unsupervised</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1492</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>17176</start>
<end>17185</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Unsupervised Grouping of Public Procurement Similar Items: Which Text Representation Should I Use?
%A Brum, Pedro P. V.
%A Silva, Mariana O.
%A Oliveira, Gabriel P.
%A Costa, Lucas G. L.
%A Lacerda, Anisio
%A Pappa, Gisele
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F brum-etal-2024-unsupervised
%X In public procurement, establishing reference prices is essential to guide competitors in setting product prices. Group-purchased products, which are not standardized by default, are necessary to estimate reference prices. Text clustering techniques can be used to group similar items based on their descriptions, enabling the definition of reference prices for specific products or services. However, selecting an appropriate representation for text is challenging. This paper introduces a framework for text cleaning, extraction, and representation. We test eight distinct sentence representations tailored for public procurement item descriptions. Among these representations, we propose an approach that captures the most important components of item descriptions. Through extensive evaluation of a dataset comprising over 2 million items, our findings show that using sophisticated supervised methods to derive vectors for unsupervised tasks offers little advantages over leveraging unsupervised methods. Our results also highlight that domain-specific contextual knowledge is crucial for representation improvement.
%U https://aclanthology.org/2024.lrec-main.1492
%P 17176-17185
Markdown (Informal)
[Unsupervised Grouping of Public Procurement Similar Items: Which Text Representation Should I Use?](https://aclanthology.org/2024.lrec-main.1492) (Brum et al., LREC-COLING 2024)
ACL