@inproceedings{zhou-etal-2022-simans,
title = "{S}im{ANS}: Simple Ambiguous Negatives Sampling for Dense Text Retrieval",
author = "Zhou, Kun and
Gong, Yeyun and
Liu, Xiao and
Zhao, Wayne Xin and
Shen, Yelong and
Dong, Anlei and
Lu, Jingwen and
Majumder, Rangan and
Wen, Ji-rong and
Duan, Nan",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = dec,
year = "2022",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-industry.56",
doi = "10.18653/v1/2022.emnlp-industry.56",
pages = "548--559",
abstract = "Sampling proper negatives from a large document pool is vital to effectively train a dense retrieval model. However, existing negative sampling strategies suffer from the uninformative or false negative problem. In this work, we empirically show that according to the measured relevance scores, the negatives ranked around the positives are generally more informative and less likely to be false negatives. Intuitively, these negatives are not too hard (\textit{may be false negatives}) or too easy (\textit{uninformative}). They are the ambiguous negatives and need more attention during training.Thus, we propose a simple ambiguous negatives sampling method, SimANS, which incorporates a new sampling probability distribution to sample more ambiguous negatives.Extensive experiments on four public and one industry datasets show the effectiveness of our approach.We made the code and models publicly available in \url{https://github.com/microsoft/SimXNS}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhou-etal-2022-simans">
<titleInfo>
<title>SimANS: Simple Ambiguous Negatives Sampling for Dense Text Retrieval</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kun</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yeyun</namePart>
<namePart type="family">Gong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiao</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wayne</namePart>
<namePart type="given">Xin</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yelong</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anlei</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingwen</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rangan</namePart>
<namePart type="family">Majumder</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ji-rong</namePart>
<namePart type="family">Wen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nan</namePart>
<namePart type="family">Duan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Sampling proper negatives from a large document pool is vital to effectively train a dense retrieval model. However, existing negative sampling strategies suffer from the uninformative or false negative problem. In this work, we empirically show that according to the measured relevance scores, the negatives ranked around the positives are generally more informative and less likely to be false negatives. Intuitively, these negatives are not too hard (may be false negatives) or too easy (uninformative). They are the ambiguous negatives and need more attention during training. Thus, we propose a simple ambiguous negatives sampling method, SimANS, which incorporates a new sampling probability distribution to sample more ambiguous negatives. Extensive experiments on four public and one industry datasets show the effectiveness of our approach. We made the code and models publicly available in https://github.com/microsoft/SimXNS.</abstract>
<identifier type="citekey">zhou-etal-2022-simans</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-industry.56</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-industry.56</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>548</start>
<end>559</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SimANS: Simple Ambiguous Negatives Sampling for Dense Text Retrieval
%A Zhou, Kun
%A Gong, Yeyun
%A Liu, Xiao
%A Zhao, Wayne Xin
%A Shen, Yelong
%A Dong, Anlei
%A Lu, Jingwen
%A Majumder, Rangan
%A Wen, Ji-rong
%A Duan, Nan
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F zhou-etal-2022-simans
%X Sampling proper negatives from a large document pool is vital to effectively train a dense retrieval model. However, existing negative sampling strategies suffer from the uninformative or false negative problem. In this work, we empirically show that according to the measured relevance scores, the negatives ranked around the positives are generally more informative and less likely to be false negatives. Intuitively, these negatives are not too hard (may be false negatives) or too easy (uninformative). They are the ambiguous negatives and need more attention during training. Thus, we propose a simple ambiguous negatives sampling method, SimANS, which incorporates a new sampling probability distribution to sample more ambiguous negatives. Extensive experiments on four public and one industry datasets show the effectiveness of our approach. We made the code and models publicly available in https://github.com/microsoft/SimXNS.
%R 10.18653/v1/2022.emnlp-industry.56
%U https://aclanthology.org/2022.emnlp-industry.56
%U https://doi.org/10.18653/v1/2022.emnlp-industry.56
%P 548-559
Markdown (Informal)
[SimANS: Simple Ambiguous Negatives Sampling for Dense Text Retrieval](https://aclanthology.org/2022.emnlp-industry.56) (Zhou et al., EMNLP 2022)
ACL
- Kun Zhou, Yeyun Gong, Xiao Liu, Wayne Xin Zhao, Yelong Shen, Anlei Dong, Jingwen Lu, Rangan Majumder, Ji-rong Wen, and Nan Duan. 2022. SimANS: Simple Ambiguous Negatives Sampling for Dense Text Retrieval. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 548–559, Abu Dhabi, UAE. Association for Computational Linguistics.
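
The abstract only sketches the idea of a sampling distribution that concentrates on "ambiguous" negatives whose relevance scores sit near the positive's. The snippet below is a minimal, hypothetical NumPy illustration of that idea; the function name, the hyperparameters `a` and `b`, and the squared-difference exponential form are assumptions for illustration, not the paper's exact formulation (see the paper and https://github.com/microsoft/SimXNS for the actual method).

```python
import numpy as np

def simans_style_sampling(neg_scores, pos_score, num_samples=8, a=0.5, b=0.0, seed=0):
    """Sample 'ambiguous' negatives whose retrieval scores lie near the positive's.

    neg_scores  : relevance scores s(q, d_i) of top-k candidate negatives (from a warm-up retriever)
    pos_score   : relevance score s(q, d+) of the labelled positive
    a, b        : assumed hyperparameters controlling peakedness and offset of the distribution
    """
    neg_scores = np.asarray(neg_scores, dtype=np.float64)
    # Weight peaks when a negative scores about as high as the positive:
    # much lower scores (too easy / uninformative) and much higher scores
    # (too hard / possible false negatives) both receive small weight.
    weights = np.exp(-a * (neg_scores - pos_score - b) ** 2)
    probs = weights / weights.sum()
    rng = np.random.default_rng(seed)
    return rng.choice(len(neg_scores), size=num_samples, replace=False, p=probs)

# Toy usage: pick 3 ambiguous negatives from 6 scored candidates.
idx = simans_style_sampling(neg_scores=[12.1, 9.8, 15.3, 11.7, 7.2, 10.9],
                            pos_score=12.0, num_samples=3)
print(idx)
```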