% Conference paper from the Findings of ACL 2026 proceedings.
% NOTE(review): address presumably records the conference venue rather than the publisher's city; confirm before normalising.
@inproceedings{li-etal-2026-difra,
  title     = {{DiFRa}: A Unified Framework for Harmonizing Semantic Diversity and Factual Consistency in Question-Answer Generation},
  author    = {Li, Zhenqin and
               Ding, ShengYong and
               Li, Shuangyin},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1493/},
  pages     = {29857--29875},
  isbn      = {979-8-89176-395-1},
  abstract  = {Question-Answer Generation (QAG) is essential for alleviating the cold-start problem in domain-specific large language model (LLM) post-training, where high-quality data is severely scarce. Effective training samples include rich semantic diversity and rigorous factual consistency. Thus, it is necessary to consider the inherent tension between semantic breadth and factual fidelity. However, it is extremely challenging to trade off semantic diversity against factual consistency, in that generalization across the semantic space must be achieved effectively and reliably, and factual integrity must be ensured as well. To address this issue, we propose an effective framework, namely DiFRa, that integrates continuous concept diffusion with discrete knowledge graph constraints to balance semantic diversity and factual consistency. Specifically, the proposed DiFRa models discrete concepts as a continuous latent distribution to sample embeddings that capture rich semantic variations, and constructs a refined knowledge graph as explicit factual constraints. Then, a diversity and consistency aware mechanism is designed to dynamically integrate both embeddings and the knowledge graph for QA pairs generation. Furthermore, we introduce SeFa, which harmonizes semantic entropy and consistency scores to quantify the trade-off between diversity and correctness. Extensive experiments demonstrate that DiFRa consistently outperforms the baseline models, validating its efficacy in reconciling the tension to generate semantically diverse and factually consistent QA pairs. The source code is publicly available.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2026-difra">
<titleInfo>
<title>DiFRa: A Unified Framework for Harmonizing Semantic Diversity and Factual Consistency in Question-Answer Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhenqin</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">ShengYong</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuangyin</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Question-Answer Generation (QAG) is essential for alleviating the cold-start problem in domain-specific large language model (LLM) post-training, where high-quality data is severely scarce. Effective training samples include rich semantic diversity and rigorous factual consistency. Thus, it is necessary to consider the inherent tension between semantic breadth and factual fidelity. However, it is extremely challenging to trade off semantic diversity against factual consistency, in that generalization across the semantic space must be achieved effectively and reliably, and factual integrity must be ensured as well. To address this issue, we propose an effective framework, namely DiFRa, that integrates continuous concept diffusion with discrete knowledge graph constraints to balance semantic diversity and factual consistency. Specifically, the proposed DiFRa models discrete concepts as a continuous latent distribution to sample embeddings that capture rich semantic variations, and constructs a refined knowledge graph as explicit factual constraints. Then, a diversity and consistency aware mechanism is designed to dynamically integrate both embeddings and the knowledge graph for QA pairs generation. Furthermore, we introduce SeFa, which harmonizes semantic entropy and consistency scores to quantify the trade-off between diversity and correctness. Extensive experiments demonstrate that DiFRa consistently outperforms the baseline models, validating its efficacy in reconciling the tension to generate semantically diverse and factually consistent QA pairs. The source code is publicly available.</abstract>
<identifier type="citekey">li-etal-2026-difra</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1493/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>29857</start>
<end>29875</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DiFRa: A Unified Framework for Harmonizing Semantic Diversity and Factual Consistency in Question-Answer Generation
%A Li, Zhenqin
%A Ding, ShengYong
%A Li, Shuangyin
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F li-etal-2026-difra
%X Question-Answer Generation (QAG) is essential for alleviating the cold-start problem in domain-specific large language model (LLM) post-training, where high-quality data is severely scarce. Effective training samples include rich semantic diversity and rigorous factual consistency. Thus, it is necessary to consider the inherent tension between semantic breadth and factual fidelity. However, it is extremely challenging to trade off semantic diversity against factual consistency, in that generalization across the semantic space must be achieved effectively and reliably, and factual integrity must be ensured as well. To address this issue, we propose an effective framework, namely DiFRa, that integrates continuous concept diffusion with discrete knowledge graph constraints to balance semantic diversity and factual consistency. Specifically, the proposed DiFRa models discrete concepts as a continuous latent distribution to sample embeddings that capture rich semantic variations, and constructs a refined knowledge graph as explicit factual constraints. Then, a diversity and consistency aware mechanism is designed to dynamically integrate both embeddings and the knowledge graph for QA pairs generation. Furthermore, we introduce SeFa, which harmonizes semantic entropy and consistency scores to quantify the trade-off between diversity and correctness. Extensive experiments demonstrate that DiFRa consistently outperforms the baseline models, validating its efficacy in reconciling the tension to generate semantically diverse and factually consistent QA pairs. The source code is publicly available.
%U https://aclanthology.org/2026.findings-acl.1493/
%P 29857-29875
Markdown (Informal)
[DiFRa: A Unified Framework for Harmonizing Semantic Diversity and Factual Consistency in Question-Answer Generation](https://aclanthology.org/2026.findings-acl.1493/) (Li et al., Findings 2026)
ACL