@inproceedings{chen-etal-2025-essential,
title = "What are the Essential Factors in Crafting Effective Long Context Multi-Hop Instruction Datasets? Insights and Best Practices",
author = "Chen, Zhi and
Chen, Qiguang and
Qin, Libo and
Guo, Qipeng and
Lv, Haijun and
Zou, Yicheng and
Yan, Hang and
Chen, Kai and
Lin, Dahua",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1316/",
doi = "10.18653/v1/2025.acl-long.1316",
pages = "27129--27151",
ISBN = "979-8-89176-251-0",
abstract = "Recent advancements in large language models (LLMs) with extended context windows have significantly improved various tasks. To improve long-context capabilities, much work focuses on augmenting LLM{'}s capabilities with synthetic data. Existing methods often leverage the Self-Instruct framework to generate long-context instruction-tuning data. However, our preliminary experiments show that fewer than 35{\%} of samples generated by Qwen-2-72B are multi-hop, and over 40{\%} exhibit poor quality, limiting comprehensive understanding and further research. To address this, we propose the Multi-agent Interactive Multi-hop Generation (MIMG) framework, which integrates a quality verification agent, a single-hop question generation agent, a multiple question sampling strategy, and a multi-hop question merger agent. This framework significantly improves data quality, with high-quality, multi-hop, and diverse data. Furthermore, we conduct a thorough analysis of document selection, question merging, and validation techniques through extensive experiments across various models. Our results demonstrate that synthetic high-quality long-context instruction data can enhance model performance, surpassing even models trained on larger amounts of human-annotated data."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2025-essential">
<titleInfo>
<title>What are the Essential Factors in Crafting Effective Long Context Multi-Hop Instruction Datasets? Insights and Best Practices</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qiguang</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Libo</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qipeng</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haijun</namePart>
<namePart type="family">Lv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yicheng</namePart>
<namePart type="family">Zou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hang</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dahua</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Recent advancements in large language models (LLMs) with extended context windows have significantly improved various tasks. To improve long-context capabilities, much work focuses on augmenting LLM’s capabilities with synthetic data. Existing methods often leverage the Self-Instruct framework to generate long-context instruction-tuning data. However, our preliminary experiments show that fewer than 35% of samples generated by Qwen-2-72B are multi-hop, and over 40% exhibit poor quality, limiting comprehensive understanding and further research. To address this, we propose the Multi-agent Interactive Multi-hop Generation (MIMG) framework, which integrates a quality verification agent, a single-hop question generation agent, a multiple question sampling strategy, and a multi-hop question merger agent. This framework significantly improves data quality, with high-quality, multi-hop, and diverse data. Furthermore, we conduct a thorough analysis of document selection, question merging, and validation techniques through extensive experiments across various models. Our results demonstrate that synthetic high-quality long-context instruction data can enhance model performance, surpassing even models trained on larger amounts of human-annotated data.</abstract>
<identifier type="citekey">chen-etal-2025-essential</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1316</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1316/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>27129</start>
<end>27151</end>
</extent>
</part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T What are the Essential Factors in Crafting Effective Long Context Multi-Hop Instruction Datasets? Insights and Best Practices
%A Chen, Zhi
%A Chen, Qiguang
%A Qin, Libo
%A Guo, Qipeng
%A Lv, Haijun
%A Zou, Yicheng
%A Yan, Hang
%A Chen, Kai
%A Lin, Dahua
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F chen-etal-2025-essential
%X Recent advancements in large language models (LLMs) with extended context windows have significantly improved various tasks. To improve long-context capabilities, much work focuses on augmenting LLM’s capabilities with synthetic data. Existing methods often leverage the Self-Instruct framework to generate long-context instruction-tuning data. However, our preliminary experiments show that fewer than 35% of samples generated by Qwen-2-72B are multi-hop, and over 40% exhibit poor quality, limiting comprehensive understanding and further research. To address this, we propose the Multi-agent Interactive Multi-hop Generation (MIMG) framework, which integrates a quality verification agent, a single-hop question generation agent, a multiple question sampling strategy, and a multi-hop question merger agent. This framework significantly improves data quality, with high-quality, multi-hop, and diverse data. Furthermore, we conduct a thorough analysis of document selection, question merging, and validation techniques through extensive experiments across various models. Our results demonstrate that synthetic high-quality long-context instruction data can enhance model performance, surpassing even models trained on larger amounts of human-annotated data.
%R 10.18653/v1/2025.acl-long.1316
%U https://aclanthology.org/2025.acl-long.1316/
%U https://doi.org/10.18653/v1/2025.acl-long.1316
%P 27129-27151

Markdown (Informal):
[What are the Essential Factors in Crafting Effective Long Context Multi-Hop Instruction Datasets? Insights and Best Practices](https://aclanthology.org/2025.acl-long.1316/) (Chen et al., ACL 2025)

ACL:
Zhi Chen, Qiguang Chen, Libo Qin, Qipeng Guo, Haijun Lv, Yicheng Zou, Hang Yan, Kai Chen, and Dahua Lin. 2025. What are the Essential Factors in Crafting Effective Long Context Multi-Hop Instruction Datasets? Insights and Best Practices. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 27129–27151, Vienna, Austria. Association for Computational Linguistics.