@inproceedings{yuan-etal-2023-cosines,
title = "{C}o{S}i{NES}: Contrastive {S}iamese Network for Entity Standardization",
author = "Yuan, Jiaqing and
Merler, Michele and
Choudhury, Mihir and
Pavuluri, Raju and
Singh, Munindar and
Vukovic, Maja",
editor = "Hruschka, Estevam and
Mitchell, Tom and
Rahman, Sajjadur and
Mladeni{\'c}, Dunja and
Grobelnik, Marko",
booktitle = "Proceedings of the First Workshop on Matching From Unstructured and Structured Data (MATCHING 2023)",
month = jul,
year = "2023",
address = "Toronto, ON, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.matching-1.9",
doi = "10.18653/v1/2023.matching-1.9",
pages = "109--119",
abstract = "Entity standardization maps noisy mentions from free-form text to standard entities in a knowledge base. The unique challenge of this task relative to other entity-related tasks is the lack of surrounding context and numerous variations in the surface form of the mentions, especially when it comes to generalization across domains where labeled data is scarce. Previous research mostly focuses on developing models either heavily relying on context, or dedicated solely to a specific domain. In contrast, we propose CoSiNES, a generic and adaptable framework with Contrastive Siamese Network for Entity Standardization that effectively adapts a pretrained language model to capture the syntax and semantics of the entities in a new domain. We construct a new dataset in the technology domain, which contains 640 technical stack entities and 6,412 mentions collected from industrial content management systems. We demonstrate that CoSiNES yields higher accuracy and faster runtime than baselines derived from leading methods in this domain. CoSiNES also achieves competitive performance in four standard datasets from the chemistry, medicine, and biomedical domains, demonstrating its cross-domain applicability. Code and data is available at \url{https://github.com/konveyor/tackle-container-advisor/tree/main/entity_standardizer/cosines}",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yuan-etal-2023-cosines">
<titleInfo>
<title>CoSiNES: Contrastive Siamese Network for Entity Standardization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiaqing</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michele</namePart>
<namePart type="family">Merler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mihir</namePart>
<namePart type="family">Choudhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raju</namePart>
<namePart type="family">Pavuluri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Munindar</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maja</namePart>
<namePart type="family">Vukovic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Matching From Unstructured and Structured Data (MATCHING 2023)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Estevam</namePart>
<namePart type="family">Hruschka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Mitchell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sajjadur</namePart>
<namePart type="family">Rahman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dunja</namePart>
<namePart type="family">Mladenić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, ON, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Entity standardization maps noisy mentions from free-form text to standard entities in a knowledge base. The unique challenge of this task relative to other entity-related tasks is the lack of surrounding context and numerous variations in the surface form of the mentions, especially when it comes to generalization across domains where labeled data is scarce. Previous research mostly focuses on developing models either heavily relying on context, or dedicated solely to a specific domain. In contrast, we propose CoSiNES, a generic and adaptable framework with Contrastive Siamese Network for Entity Standardization that effectively adapts a pretrained language model to capture the syntax and semantics of the entities in a new domain. We construct a new dataset in the technology domain, which contains 640 technical stack entities and 6,412 mentions collected from industrial content management systems. We demonstrate that CoSiNES yields higher accuracy and faster runtime than baselines derived from leading methods in this domain. CoSiNES also achieves competitive performance in four standard datasets from the chemistry, medicine, and biomedical domains, demonstrating its cross-domain applicability. Code and data is available at https://github.com/konveyor/tackle-container-advisor/tree/main/entity_standardizer/cosines</abstract>
<identifier type="citekey">yuan-etal-2023-cosines</identifier>
<identifier type="doi">10.18653/v1/2023.matching-1.9</identifier>
<location>
<url>https://aclanthology.org/2023.matching-1.9</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>109</start>
<end>119</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CoSiNES: Contrastive Siamese Network for Entity Standardization
%A Yuan, Jiaqing
%A Merler, Michele
%A Choudhury, Mihir
%A Pavuluri, Raju
%A Singh, Munindar
%A Vukovic, Maja
%Y Hruschka, Estevam
%Y Mitchell, Tom
%Y Rahman, Sajjadur
%Y Mladenić, Dunja
%Y Grobelnik, Marko
%S Proceedings of the First Workshop on Matching From Unstructured and Structured Data (MATCHING 2023)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, ON, Canada
%F yuan-etal-2023-cosines
%X Entity standardization maps noisy mentions from free-form text to standard entities in a knowledge base. The unique challenge of this task relative to other entity-related tasks is the lack of surrounding context and numerous variations in the surface form of the mentions, especially when it comes to generalization across domains where labeled data is scarce. Previous research mostly focuses on developing models either heavily relying on context, or dedicated solely to a specific domain. In contrast, we propose CoSiNES, a generic and adaptable framework with Contrastive Siamese Network for Entity Standardization that effectively adapts a pretrained language model to capture the syntax and semantics of the entities in a new domain. We construct a new dataset in the technology domain, which contains 640 technical stack entities and 6,412 mentions collected from industrial content management systems. We demonstrate that CoSiNES yields higher accuracy and faster runtime than baselines derived from leading methods in this domain. CoSiNES also achieves competitive performance in four standard datasets from the chemistry, medicine, and biomedical domains, demonstrating its cross-domain applicability. Code and data is available at https://github.com/konveyor/tackle-container-advisor/tree/main/entity_standardizer/cosines
%R 10.18653/v1/2023.matching-1.9
%U https://aclanthology.org/2023.matching-1.9
%U https://doi.org/10.18653/v1/2023.matching-1.9
%P 109-119
Markdown (Informal)
[CoSiNES: Contrastive Siamese Network for Entity Standardization](https://aclanthology.org/2023.matching-1.9) (Yuan et al., MATCHING 2023)
ACL
- Jiaqing Yuan, Michele Merler, Mihir Choudhury, Raju Pavuluri, Munindar Singh, and Maja Vukovic. 2023. CoSiNES: Contrastive Siamese Network for Entity Standardization. In Proceedings of the First Workshop on Matching From Unstructured and Structured Data (MATCHING 2023), pages 109–119, Toronto, ON, Canada. Association for Computational Linguistics.