@inproceedings{shu-etal-2023-intent,
title = "An Intent-based and Annotation-free Method for Duplicate Question Detection in {CQA} Forums",
author = "Shu, Yubo and
Gu, Hansu and
Zhang, Peng and
Lu, Tun and
Gu, Ning",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.596",
doi = "10.18653/v1/2023.findings-emnlp.596",
pages = "8889--8899",
abstract = "With the advent of large language models (LLMs), Community Question Answering (CQA) forums offer well-curated questions and answers that can be utilized for instruction-tuning, effectively training LLMs to be aligned with human intents. However, the issue of duplicate questions arises as the volume of content within CQA continues to grow, posing a threat to content quality. Recent research highlights the benefits of detecting and eliminating duplicate content. It not only enhances the LLMs{'} ability to generalize across diverse intents but also improves the efficiency of training data utilization while addressing concerns related to information leakage. However, existing methods for detecting duplicate questions in CQA typically rely on generic text-pair matching models, overlooking the intent behind the questions. In this paper, we propose a novel intent-based duplication detector named Intent-DQD that comprehensively leverages intent information to address the problem of duplicate question detection in CQA. Intent-DQD first leverages the characteristics in CQA forums and extracts training labels to recognize and match intents without human annotation. Intent-DQD then effectively aggregates intent-level relations and establishes question-level relations to enable intent-aware duplication detection. Experimental results on fifteen distinct domains from both CQADupStack and Stack Overflow datasets demonstrate the effectiveness of Intent-DQD. Reproducible codes and datasets will be released upon publication of the paper.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shu-etal-2023-intent">
<titleInfo>
<title>An Intent-based and Annotation-free Method for Duplicate Question Detection in CQA Forums</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yubo</namePart>
<namePart type="family">Shu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hansu</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tun</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ning</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With the advent of large language models (LLMs), Community Question Answering (CQA) forums offer well-curated questions and answers that can be utilized for instruction-tuning, effectively training LLMs to be aligned with human intents. However, the issue of duplicate questions arises as the volume of content within CQA continues to grow, posing a threat to content quality. Recent research highlights the benefits of detecting and eliminating duplicate content. It not only enhances the LLMs’ ability to generalize across diverse intents but also improves the efficiency of training data utilization while addressing concerns related to information leakage. However, existing methods for detecting duplicate questions in CQA typically rely on generic text-pair matching models, overlooking the intent behind the questions. In this paper, we propose a novel intent-based duplication detector named Intent-DQD that comprehensively leverages intent information to address the problem of duplicate question detection in CQA. Intent-DQD first leverages the characteristics in CQA forums and extracts training labels to recognize and match intents without human annotation. Intent-DQD then effectively aggregates intent-level relations and establishes question-level relations to enable intent-aware duplication detection. Experimental results on fifteen distinct domains from both CQADupStack and Stack Overflow datasets demonstrate the effectiveness of Intent-DQD. Reproducible codes and datasets will be released upon publication of the paper.</abstract>
<identifier type="citekey">shu-etal-2023-intent</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.596</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.596</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>8889</start>
<end>8899</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An Intent-based and Annotation-free Method for Duplicate Question Detection in CQA Forums
%A Shu, Yubo
%A Gu, Hansu
%A Zhang, Peng
%A Lu, Tun
%A Gu, Ning
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F shu-etal-2023-intent
%X With the advent of large language models (LLMs), Community Question Answering (CQA) forums offer well-curated questions and answers that can be utilized for instruction-tuning, effectively training LLMs to be aligned with human intents. However, the issue of duplicate questions arises as the volume of content within CQA continues to grow, posing a threat to content quality. Recent research highlights the benefits of detecting and eliminating duplicate content. It not only enhances the LLMs’ ability to generalize across diverse intents but also improves the efficiency of training data utilization while addressing concerns related to information leakage. However, existing methods for detecting duplicate questions in CQA typically rely on generic text-pair matching models, overlooking the intent behind the questions. In this paper, we propose a novel intent-based duplication detector named Intent-DQD that comprehensively leverages intent information to address the problem of duplicate question detection in CQA. Intent-DQD first leverages the characteristics in CQA forums and extracts training labels to recognize and match intents without human annotation. Intent-DQD then effectively aggregates intent-level relations and establishes question-level relations to enable intent-aware duplication detection. Experimental results on fifteen distinct domains from both CQADupStack and Stack Overflow datasets demonstrate the effectiveness of Intent-DQD. Reproducible codes and datasets will be released upon publication of the paper.
%R 10.18653/v1/2023.findings-emnlp.596
%U https://aclanthology.org/2023.findings-emnlp.596
%U https://doi.org/10.18653/v1/2023.findings-emnlp.596
%P 8889-8899
Markdown (Informal)
[An Intent-based and Annotation-free Method for Duplicate Question Detection in CQA Forums](https://aclanthology.org/2023.findings-emnlp.596) (Shu et al., Findings 2023)
ACL