BibTeX
@inproceedings{arslan-etal-2025-using,
title = "Using {LLM}s to Advance Idiom Corpus Construction",
author = {Arslan, Do{\u{g}}ukan and
{\c{C}}akmak, H{\"u}seyin An{\i}l and
Eryigit, Gulsen and
Nivre, Joakim},
editor = {Ojha, Atul Kr. and
Giouli, Voula and
Mititelu, Verginica Barbu and
Constant, Mathieu and
Korvel, Gra{\v{z}}ina and
Do{\u{g}}ru{\"o}z, A. Seza and
Rademaker, Alexandre},
booktitle = "Proceedings of the 21st Workshop on Multiword Expressions (MWE 2025)",
month = may,
year = "2025",
address = "Albuquerque, New Mexico, U.S.A.",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.mwe-1.4/",
doi = "10.18653/v1/2025.mwe-1.4",
pages = "21--31",
ISBN = "979-8-89176-243-5",
abstract = "Idiom corpora typically include both idiomatic and literal examples of potentially idiomatic expressions, but creating such corpora traditionally requires substantial expert effort and cost. In this article, we explore the use of large language models (LLMs) to generate synthetic idiom corpora as a more time- and cost-efficient alternative. We evaluate the effectiveness of synthetic data in training task-specific models and testing GPT-4 in few-shot prompting setting using synthetic data for idiomaticity detection. Our findings reveal that although models trained on synthetic data perform worse than those trained on human-generated data, synthetic data generation offers considerable advantages in terms of cost and time. Specifically, task-specific idiomaticity detection models trained on synthetic data outperform the general-purpose LLM that generated the data when evaluated in a zero-shot setting, achieving an average improvement of 11 percentage points across four languages. Moreover, synthetic data enhances the LLM{'}s performance, enabling it to match the task-specific models trained with synthetic data when few-shot prompting is applied."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="arslan-etal-2025-using">
<titleInfo>
<title>Using LLMs to Advance Idiom Corpus Construction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Doğukan</namePart>
<namePart type="family">Arslan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hüseyin</namePart>
<namePart type="given">Anıl</namePart>
<namePart type="family">Çakmak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gulsen</namePart>
<namePart type="family">Eryigit</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joakim</namePart>
<namePart type="family">Nivre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st Workshop on Multiword Expressions (MWE 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Atul</namePart>
<namePart type="given">Kr.</namePart>
<namePart type="family">Ojha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Voula</namePart>
<namePart type="family">Giouli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Verginica</namePart>
<namePart type="given">Barbu</namePart>
<namePart type="family">Mititelu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mathieu</namePart>
<namePart type="family">Constant</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gražina</namePart>
<namePart type="family">Korvel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">A</namePart>
<namePart type="given">Seza</namePart>
<namePart type="family">Doğruöz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandre</namePart>
<namePart type="family">Rademaker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico, U.S.A.</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-243-5</identifier>
</relatedItem>
<abstract>Idiom corpora typically include both idiomatic and literal examples of potentially idiomatic expressions, but creating such corpora traditionally requires substantial expert effort and cost. In this article, we explore the use of large language models (LLMs) to generate synthetic idiom corpora as a more time- and cost-efficient alternative. We evaluate the effectiveness of synthetic data in training task-specific models and in testing GPT-4 in a few-shot prompting setting using synthetic data for idiomaticity detection. Our findings reveal that although models trained on synthetic data perform worse than those trained on human-generated data, synthetic data generation offers considerable advantages in terms of cost and time. Specifically, task-specific idiomaticity detection models trained on synthetic data outperform the general-purpose LLM that generated the data when evaluated in a zero-shot setting, achieving an average improvement of 11 percentage points across four languages. Moreover, synthetic data enhances the LLM’s performance, enabling it to match the task-specific models trained with synthetic data when few-shot prompting is applied.</abstract>
<identifier type="citekey">arslan-etal-2025-using</identifier>
<identifier type="doi">10.18653/v1/2025.mwe-1.4</identifier>
<location>
<url>https://aclanthology.org/2025.mwe-1.4/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>21</start>
<end>31</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Using LLMs to Advance Idiom Corpus Construction
%A Arslan, Doğukan
%A Çakmak, Hüseyin Anıl
%A Eryigit, Gulsen
%A Nivre, Joakim
%Y Ojha, Atul Kr.
%Y Giouli, Voula
%Y Mititelu, Verginica Barbu
%Y Constant, Mathieu
%Y Korvel, Gražina
%Y Doğruöz, A. Seza
%Y Rademaker, Alexandre
%S Proceedings of the 21st Workshop on Multiword Expressions (MWE 2025)
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico, U.S.A.
%@ 979-8-89176-243-5
%F arslan-etal-2025-using
%X Idiom corpora typically include both idiomatic and literal examples of potentially idiomatic expressions, but creating such corpora traditionally requires substantial expert effort and cost. In this article, we explore the use of large language models (LLMs) to generate synthetic idiom corpora as a more time- and cost-efficient alternative. We evaluate the effectiveness of synthetic data in training task-specific models and in testing GPT-4 in a few-shot prompting setting using synthetic data for idiomaticity detection. Our findings reveal that although models trained on synthetic data perform worse than those trained on human-generated data, synthetic data generation offers considerable advantages in terms of cost and time. Specifically, task-specific idiomaticity detection models trained on synthetic data outperform the general-purpose LLM that generated the data when evaluated in a zero-shot setting, achieving an average improvement of 11 percentage points across four languages. Moreover, synthetic data enhances the LLM’s performance, enabling it to match the task-specific models trained with synthetic data when few-shot prompting is applied.
%R 10.18653/v1/2025.mwe-1.4
%U https://aclanthology.org/2025.mwe-1.4/
%U https://doi.org/10.18653/v1/2025.mwe-1.4
%P 21-31
Markdown (Informal)
[Using LLMs to Advance Idiom Corpus Construction](https://aclanthology.org/2025.mwe-1.4/) (Arslan et al., MWE 2025)
ACL
- Doğukan Arslan, Hüseyin Anıl Çakmak, Gulsen Eryigit, and Joakim Nivre. 2025. Using LLMs to Advance Idiom Corpus Construction. In Proceedings of the 21st Workshop on Multiword Expressions (MWE 2025), pages 21–31, Albuquerque, New Mexico, U.S.A. Association for Computational Linguistics.