% Conference paper; ACL Anthology ID 2025.emnlp-main.1319.
% The address field holds the location given by the source export, kept as-is.
@inproceedings{han-etal-2025-rule,
  title     = {Rule Discovery for Natural Language Inference Data Generation Using Out-of-Distribution Detection},
  author    = {Han, Juyoung and
               Hwang, Hyunsun and
               Lee, Changki},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.emnlp-main.1319/},
  doi       = {10.18653/v1/2025.emnlp-main.1319},
  pages     = {25971--25991},
  isbn      = {979-8-89176-332-6},
  abstract  = {Natural Language Inference (NLI) is a fundamental task in Natural Language Processing (NLP), yet adapting NLI models to new domains remains challenging due to the high cost of collecting domain-specific training data. While prior work proposed 15 sentence transformation rules to automate training data generation, these rules insufficiently capture the diversity of natural language. We propose a novel framework that combines Out-of-Distribution (OOD) detection and BERT-based clustering to identify premise{--}hypothesis pairs in the SNLI dataset that are not covered by existing rules and to discover four new transformation rules from them. Using these rules with Chain-of-Thought (CoT) prompting and Large Language Models (LLMs), we generate high-quality training data and augment the SNLI dataset. Our method yields consistent performance improvements across dataset sizes, achieving +0.85{\%}p accuracy on 2k and +0.15{\%}p on 550k samples. Furthermore, a distribution-aware augmentation strategy enhances performance across all scales. Beyond manual explanations, we extend our framework to automatically generated explanations (CoT-Ex), demonstrating that they provide a scalable alternative to human-written explanations and enable reliable rule discovery.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, keyed by the same ID as the citekey identifier below. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="han-etal-2025-rule">
<!-- Title of the paper itself. -->
<titleInfo>
<title>Rule Discovery for Natural Language Inference Data Generation Using Out-of-Distribution Detection</title>
</titleInfo>
<!-- Paper authors: one <name> element per person, each with role "author". -->
<name type="personal">
<namePart type="given">Juyoung</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyunsun</namePart>
<namePart type="family">Hwang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Changki</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper, given as year-month. -->
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume containing the paper, with its editors, publisher, place and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<!-- Volume editors: one <name> element per person, each with role "editor". -->
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<!-- Abstract as plain text (no LaTeX escapes). -->
<abstract>Natural Language Inference (NLI) is a fundamental task in Natural Language Processing (NLP), yet adapting NLI models to new domains remains challenging due to the high cost of collecting domain-specific training data. While prior work proposed 15 sentence transformation rules to automate training data generation, these rules insufficiently capture the diversity of natural language. We propose a novel framework that combines Out-of-Distribution (OOD) detection and BERT-based clustering to identify premise–hypothesis pairs in the SNLI dataset that are not covered by existing rules and to discover four new transformation rules from them. Using these rules with Chain-of-Thought (CoT) prompting and Large Language Models (LLMs), we generate high-quality training data and augment the SNLI dataset. Our method yields consistent performance improvements across dataset sizes, achieving +0.85%p accuracy on 2k and +0.15%p on 550k samples. Furthermore, a distribution-aware augmentation strategy enhances performance across all scales. Beyond manual explanations, we extend our framework to automatically generated explanations (CoT-Ex), demonstrating that they provide a scalable alternative to human-written explanations and enable reliable rule discovery.</abstract>
<!-- Identifiers for the paper: citation key, bare DOI, and landing-page URL. -->
<identifier type="citekey">han-etal-2025-rule</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.1319</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1319/</url>
</location>
<!-- Position of the paper within the host volume: date and page extent. -->
<part>
<date>2025-11</date>
<extent unit="page">
<start>25971</start>
<end>25991</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Rule Discovery for Natural Language Inference Data Generation Using Out-of-Distribution Detection
%A Han, Juyoung
%A Hwang, Hyunsun
%A Lee, Changki
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F han-etal-2025-rule
%X Natural Language Inference (NLI) is a fundamental task in Natural Language Processing (NLP), yet adapting NLI models to new domains remains challenging due to the high cost of collecting domain-specific training data. While prior work proposed 15 sentence transformation rules to automate training data generation, these rules insufficiently capture the diversity of natural language. We propose a novel framework that combines Out-of-Distribution (OOD) detection and BERT-based clustering to identify premise–hypothesis pairs in the SNLI dataset that are not covered by existing rules and to discover four new transformation rules from them. Using these rules with Chain-of-Thought (CoT) prompting and Large Language Models (LLMs), we generate high-quality training data and augment the SNLI dataset. Our method yields consistent performance improvements across dataset sizes, achieving +0.85%p accuracy on 2k and +0.15%p on 550k samples. Furthermore, a distribution-aware augmentation strategy enhances performance across all scales. Beyond manual explanations, we extend our framework to automatically generated explanations (CoT-Ex), demonstrating that they provide a scalable alternative to human-written explanations and enable reliable rule discovery.
%R 10.18653/v1/2025.emnlp-main.1319
%U https://aclanthology.org/2025.emnlp-main.1319/
%U https://doi.org/10.18653/v1/2025.emnlp-main.1319
%P 25971-25991
Markdown (Informal)
[Rule Discovery for Natural Language Inference Data Generation Using Out-of-Distribution Detection](https://aclanthology.org/2025.emnlp-main.1319/) (Han et al., EMNLP 2025)
ACL