@inproceedings{sun-etal-2025-mitigating,
title = "Mitigating Shortcut Learning via Smart Data Augmentation based on Large Language Model",
author = "Sun, Xinyi and
Tan, Hongye and
Guo, Yaxin and
Qiang, Pengpeng and
Li, Ru and
Zhang, Hu",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.543/",
pages = "8160--8172",
abstract = "Data-driven pre-trained language models typically perform shortcut learning wherein they rely on the spurious correlations between the data and the ground truth. This reliance can undermine the robustness and generalization of the model. To address this issue, data augmentation emerges as a promising solution. By integrating anti-shortcut data to the training set, the models' shortcut-induced biases can be mitigated. However, existing methods encounter three challenges: 1) Manual definition of shortcuts is tailored to particular datasets, restricting generalization. 2) The inherent confirmation bias during model training hampers the effectiveness of data augmentation. 3) Insufficient exploration of the relationship between the model performance and the augmented data quantity may result in excessive data consumption. To tackle these challenges, we propose a method of Smart Data Augmentation based on Large Language Models (SAug-LLM). It leverages the LLMs to autonomously identify shortcuts and generate their anti-shortcut counterparts. In addition, the dual validation is employed to mitigate the confirmation bias during the model retraining. Furthermore, the data augmentation process is optimized to effectively rectify model biases while minimizing data consumption. We validate the effectiveness and generalization of our method through extensive experiments across various natural language processing tasks, demonstrating an average performance improvement of 5.61{\%}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sun-etal-2025-mitigating">
<titleInfo>
<title>Mitigating Shortcut Learning via Smart Data Augmentation based on Large Language Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xinyi</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongye</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaxin</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pengpeng</namePart>
<namePart type="family">Qiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ru</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hu</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Data-driven pre-trained language models typically perform shortcut learning wherein they rely on the spurious correlations between the data and the ground truth. This reliance can undermine the robustness and generalization of the model. To address this issue, data augmentation emerges as a promising solution. By integrating anti-shortcut data to the training set, the models’ shortcut-induced biases can be mitigated. However, existing methods encounter three challenges: 1) Manual definition of shortcuts is tailored to particular datasets, restricting generalization. 2) The inherent confirmation bias during model training hampers the effectiveness of data augmentation. 3) Insufficient exploration of the relationship between the model performance and the augmented data quantity may result in excessive data consumption. To tackle these challenges, we propose a method of Smart Data Augmentation based on Large Language Models (SAug-LLM). It leverages the LLMs to autonomously identify shortcuts and generate their anti-shortcut counterparts. In addition, the dual validation is employed to mitigate the confirmation bias during the model retraining. Furthermore, the data augmentation process is optimized to effectively rectify model biases while minimizing data consumption. We validate the effectiveness and generalization of our method through extensive experiments across various natural language processing tasks, demonstrating an average performance improvement of 5.61%.</abstract>
<identifier type="citekey">sun-etal-2025-mitigating</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.543/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>8160</start>
<end>8172</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mitigating Shortcut Learning via Smart Data Augmentation based on Large Language Model
%A Sun, Xinyi
%A Tan, Hongye
%A Guo, Yaxin
%A Qiang, Pengpeng
%A Li, Ru
%A Zhang, Hu
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F sun-etal-2025-mitigating
%X Data-driven pre-trained language models typically perform shortcut learning wherein they rely on the spurious correlations between the data and the ground truth. This reliance can undermine the robustness and generalization of the model. To address this issue, data augmentation emerges as a promising solution. By integrating anti-shortcut data to the training set, the models’ shortcut-induced biases can be mitigated. However, existing methods encounter three challenges: 1) Manual definition of shortcuts is tailored to particular datasets, restricting generalization. 2) The inherent confirmation bias during model training hampers the effectiveness of data augmentation. 3) Insufficient exploration of the relationship between the model performance and the augmented data quantity may result in excessive data consumption. To tackle these challenges, we propose a method of Smart Data Augmentation based on Large Language Models (SAug-LLM). It leverages the LLMs to autonomously identify shortcuts and generate their anti-shortcut counterparts. In addition, the dual validation is employed to mitigate the confirmation bias during the model retraining. Furthermore, the data augmentation process is optimized to effectively rectify model biases while minimizing data consumption. We validate the effectiveness and generalization of our method through extensive experiments across various natural language processing tasks, demonstrating an average performance improvement of 5.61%.
%U https://aclanthology.org/2025.coling-main.543/
%P 8160-8172
Markdown (Informal)
[Mitigating Shortcut Learning via Smart Data Augmentation based on Large Language Model](https://aclanthology.org/2025.coling-main.543/) (Sun et al., COLING 2025)
ACL