% BibTeX record for a conference paper (inproceedings entry type).
% The citation key is the same identifier used as the MODS citekey and the
% EndNote %F label further down in this file.
% NOTE(review): address appears to hold the conference location rather than
% the publisher's city; left unchanged so rendered citations stay the same.
@inproceedings{huang-etal-2026-integrating,
  title     = {Integrating Data Validation with Large Language Models for Regulation-Guided Tabular Anomaly Detection},
  author    = {Huang, Haoliang and
               Cai, Zihuang and
               Tang, Zhuo and
               Liu, Yifan and
               Tian, Chen and
               Li, Kenli and
               Chen, Changjian},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.297/},
  pages     = {6559--6581},
  isbn      = {979-8-89176-390-6},
  abstract  = {In many real-world applications, such as medical insurance, many regulations exist that define how data should comply with certain standards. Auditors typically use these regulations to identify anomalies in tabular data. However, existing tabular anomaly detection methods often focus on detecting anomalies based on data distribution without considering regulatory compliance. In this paper, we introduce a new task, Regulation-guided Tabular Anomaly Detection, which leverages regulations to detect anomalies in tabular data. We also developed three new datasets for this task. To address this task, we present RegValidator, a training-free method that integrates data validation with large language models (LLMs) for detecting anomalies. In this process, the LLMs generate ideas for anomaly detection from a regulation perspective, while the data validation validates these ideas from a data distribution perspective. This process can be framed as a Budgeted Maximum Coverage problem, which can be solved by a constant-factor approximation algorithm with provable guarantees. Empirical results on the new datasets demonstrate that our method outperforms existing baselines. A field experiment in a commercial health insurance company also reveals the practical value of our method. Our code is available at https://github.com/hnu-vis/RegValidator.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record: the paper itself, with its host proceedings described in the relatedItem element. -->
  <mods ID="huang-etal-2026-integrating">
    <titleInfo>
      <title>Integrating Data Validation with Large Language Models for Regulation-Guided Tabular Anomaly Detection</title>
    </titleInfo>
    <!-- Paper authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Haoliang</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zihuang</namePart>
      <namePart type="family">Cai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhuo</namePart>
      <namePart type="family">Tang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yifan</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chen</namePart>
      <namePart type="family">Tian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kenli</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Changjian</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <!-- Middle initial keeps its period, matching "Moreira, Viviane P." in the BibTeX and EndNote records of this file. -->
        <namePart type="given">P.</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>
    <abstract>In many real-world applications, such as medical insurance, many regulations exist that define how data should comply with certain standards. Auditors typically use these regulations to identify anomalies in tabular data. However, existing tabular anomaly detection methods often focus on detecting anomalies based on data distribution without considering regulatory compliance. In this paper, we introduce a new task, Regulation-guided Tabular Anomaly Detection, which leverages regulations to detect anomalies in tabular data. We also developed three new datasets for this task. To address this task, we present RegValidator, a training-free method that integrates data validation with large language models (LLMs) for detecting anomalies. In this process, the LLMs generate ideas for anomaly detection from a regulation perspective, while the data validation validates these ideas from a data distribution perspective. This process can be framed as a Budgeted Maximum Coverage problem, which can be solved by a constant-factor approximation algorithm with provable guarantees. Empirical results on the new datasets demonstrate that our method outperforms existing baselines. A field experiment in a commercial health insurance company also reveals the practical value of our method. Our code is available at https://github.com/hnu-vis/RegValidator.</abstract>
    <identifier type="citekey">huang-etal-2026-integrating</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.297/</url>
    </location>
    <!-- Position of the paper inside the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>6559</start>
        <end>6581</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Integrating Data Validation with Large Language Models for Regulation-Guided Tabular Anomaly Detection
%A Huang, Haoliang
%A Cai, Zihuang
%A Tang, Zhuo
%A Liu, Yifan
%A Tian, Chen
%A Li, Kenli
%A Chen, Changjian
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F huang-etal-2026-integrating
%X In many real-world applications, such as medical insurance, many regulations exist that define how data should comply with certain standards. Auditors typically use these regulations to identify anomalies in tabular data. However, existing tabular anomaly detection methods often focus on detecting anomalies based on data distribution without considering regulatory compliance. In this paper, we introduce a new task, Regulation-guided Tabular Anomaly Detection, which leverages regulations to detect anomalies in tabular data. We also developed three new datasets for this task. To address this task, we present RegValidator, a training-free method that integrates data validation with large language models (LLMs) for detecting anomalies. In this process, the LLMs generate ideas for anomaly detection from a regulation perspective, while the data validation validates these ideas from a data distribution perspective. This process can be framed as a Budgeted Maximum Coverage problem, which can be solved by a constant-factor approximation algorithm with provable guarantees. Empirical results on the new datasets demonstrate that our method outperforms existing baselines. A field experiment in a commercial health insurance company also reveals the practical value of our method. Our code is available at https://github.com/hnu-vis/RegValidator.
%U https://aclanthology.org/2026.acl-long.297/
%P 6559-6581
Markdown (Informal)
[Integrating Data Validation with Large Language Models for Regulation-Guided Tabular Anomaly Detection](https://aclanthology.org/2026.acl-long.297/) (Huang et al., ACL 2026)
ACL
- Haoliang Huang, Zihuang Cai, Zhuo Tang, Yifan Liu, Chen Tian, Kenli Li, and Changjian Chen. 2026. Integrating Data Validation with Large Language Models for Regulation-Guided Tabular Anomaly Detection. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 6559–6581, San Diego, California, United States. Association for Computational Linguistics.