@inproceedings{xie-etal-2024-adversarial,
    title = "Adversarial Math Word Problem Generation",
    author = "Xie, Roy  and
      Huang, Chengxuan  and
      Wang, Junlin  and
      Dhingra, Bhuwan",
    editor = "Al-Onaizan, Yaser  and
      Bansal, Mohit  and
      Chen, Yun-Nung",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.findings-emnlp.292",
    doi = "10.18653/v1/2024.findings-emnlp.292",
    pages = "5075--5093",
    abstract = "Large language models (LLMs) have significantly transformed the educational landscape. As current plagiarism detection tools struggle to keep pace with LLMs{'} rapid advancements, the educational community faces the challenge of assessing students{'} true problem-solving abilities in the presence of LLMs. In this work, we explore a new paradigm for ensuring fair evaluation{---}generating adversarial examples which preserve the structure and difficulty of the original questions aimed for assessment, but are unsolvable by LLMs. Focusing on the domain of math word problems, we leverage abstract syntax trees to structurally generate adversarial examples that cause LLMs to produce incorrect answers by simply editing the numeric values in the problems. We conduct experiments on various open- and closed-source LLMs, quantitatively and qualitatively demonstrating that our method significantly degrades their math problem-solving ability. We identify shared vulnerabilities among LLMs and propose a cost-effective approach to attack high-cost models. Additionally, we conduct automatic analysis to investigate the cause of failure, providing further insights into the limitations of LLMs.",
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="xie-etal-2024-adversarial">
    <titleInfo>
      <title>Adversarial Math Word Problem Generation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Roy</namePart>
      <namePart type="family">Xie</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chengxuan</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Junlin</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bhuwan</namePart>
      <namePart type="family">Dhingra</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Large language models (LLMs) have significantly transformed the educational landscape. As current plagiarism detection tools struggle to keep pace with LLMs’ rapid advancements, the educational community faces the challenge of assessing students’ true problem-solving abilities in the presence of LLMs. In this work, we explore a new paradigm for ensuring fair evaluation—generating adversarial examples which preserve the structure and difficulty of the original questions aimed for assessment, but are unsolvable by LLMs. Focusing on the domain of math word problems, we leverage abstract syntax trees to structurally generate adversarial examples that cause LLMs to produce incorrect answers by simply editing the numeric values in the problems. We conduct experiments on various open- and closed-source LLMs, quantitatively and qualitatively demonstrating that our method significantly degrades their math problem-solving ability. We identify shared vulnerabilities among LLMs and propose a cost-effective approach to attack high-cost models. Additionally, we conduct automatic analysis to investigate the cause of failure, providing further insights into the limitations of LLMs.</abstract>
    <identifier type="citekey">xie-etal-2024-adversarial</identifier>
    <identifier type="doi">10.18653/v1/2024.findings-emnlp.292</identifier>
    <location>
      <url>https://aclanthology.org/2024.findings-emnlp.292</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>5075</start>
        <end>5093</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Adversarial Math Word Problem Generation
%A Xie, Roy
%A Huang, Chengxuan
%A Wang, Junlin
%A Dhingra, Bhuwan
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F xie-etal-2024-adversarial
%X Large language models (LLMs) have significantly transformed the educational landscape. As current plagiarism detection tools struggle to keep pace with LLMs’ rapid advancements, the educational community faces the challenge of assessing students’ true problem-solving abilities in the presence of LLMs. In this work, we explore a new paradigm for ensuring fair evaluation—generating adversarial examples which preserve the structure and difficulty of the original questions aimed for assessment, but are unsolvable by LLMs. Focusing on the domain of math word problems, we leverage abstract syntax trees to structurally generate adversarial examples that cause LLMs to produce incorrect answers by simply editing the numeric values in the problems. We conduct experiments on various open- and closed-source LLMs, quantitatively and qualitatively demonstrating that our method significantly degrades their math problem-solving ability. We identify shared vulnerabilities among LLMs and propose a cost-effective approach to attack high-cost models. Additionally, we conduct automatic analysis to investigate the cause of failure, providing further insights into the limitations of LLMs.
%R 10.18653/v1/2024.findings-emnlp.292
%U https://aclanthology.org/2024.findings-emnlp.292
%U https://doi.org/10.18653/v1/2024.findings-emnlp.292
%P 5075-5093
Markdown (Informal)
[Adversarial Math Word Problem Generation](https://aclanthology.org/2024.findings-emnlp.292) (Xie et al., Findings 2024)

ACL
Roy Xie, Chengxuan Huang, Junlin Wang, and Bhuwan Dhingra. 2024. Adversarial Math Word Problem Generation. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 5075–5093, Miami, Florida, USA. Association for Computational Linguistics.
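The abstract describes the attack at a high level: parse each problem's ground-truth solution into an abstract syntax tree, then edit only the numeric leaves, so the problem's structure and difficulty are preserved while the answer changes. What follows is a minimal, illustrative Python sketch of that core idea built on the standard-library ast module. It is not the authors' implementation: the perturb_numbers helper, the sampling range, and the example expression are all hypothetical, and the paper's method further selects edits that make a target LLM answer incorrectly, which this sketch does not attempt.

# Minimal sketch (assumption-laden, not the paper's code): swap the numeric
# leaves of a solution expression's AST while keeping the tree's shape, then
# re-evaluate to get the perturbed ground-truth answer.
import ast
import random

def perturb_numbers(expr: str, low: int = 2, high: int = 20) -> tuple[str, float]:
    """Replace every numeric literal in `expr` with a random integer in
    [low, high], preserving the expression's structure, and return the new
    expression together with its value."""
    tree = ast.parse(expr, mode="eval")

    class NumberSwapper(ast.NodeTransformer):
        # Visit constant leaves only; operators and parenthesization stay intact.
        def visit_Constant(self, node: ast.Constant) -> ast.AST:
            if isinstance(node.value, (int, float)):
                return ast.copy_location(ast.Constant(random.randint(low, high)), node)
            return node

    new_tree = ast.fix_missing_locations(NumberSwapper().visit(tree))
    new_expr = ast.unparse(new_tree)
    # Evaluating is safe here because only numeric leaves of our own
    # expression were changed; no new operators or names were introduced.
    new_answer = eval(compile(new_tree, "<expr>", "eval"))
    return new_expr, new_answer

# Hypothetical example: "(3 + 5) * 2" (answer 16) might become
# "(7 + 11) * 4" (answer 72) -- same tree shape, new numbers, new answer.
print(perturb_numbers("(3 + 5) * 2"))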