% Workshop paper; ACL Anthology record 2026.sigturk-1.4.
% Braced words in title/booktitle keep their capitalisation under styles that recase titles.
@inproceedings{aslantas-gungor-2026-unified,
  title     = {A Unified {Turkic} Idiom Understanding Benchmark: Idiom Detection and Semantic Retrieval Across Five {Turkic} Languages},
  author    = {Aslanta{\c{s}}, G{\"o}zde and
               Gungor, Tunga},
  editor    = {Oflazer, Kemal and
               K{\"o}ksal, Abdullatif and
               Varol, Onur},
  booktitle = {Proceedings of the Second Workshop Natural Language Processing for {Turkic} Languages ({SIGTURK} 2026)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.sigturk-1.4/},
  pages     = {38--51},
  isbn      = {979-8-89176-370-8},
  abstract  = {Idiomatic expressions are culturally grounded, semantically opaque, and difficult to interpret for multilingual natural language processing systems. Despite the large speaker population of Turkic languages, resources that focus on monolingual and cross-lingual idioms and their meanings are limited. We introduce the first unified benchmark for idiom understanding across Turkish, Azerbaijani, Turkmen, Gagauz, and Uzbek languages. The datasets compiled include token-level idiom span annotations. We develop models for idiom identification and semantic retrieval tasks. We evaluate seven models for idiom identification and nine embedding models for semantic retrieval tasks under several fine-tuning schemes using standard dense retrieval metrics. This benchmark provides a basis for studying idiomatic phenomena in Turkic languages and clarifies how idiomatic meanings are shared, altered, or diverge across languages.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for one workshop paper; the host proceedings are described in relatedItem. -->
  <mods ID="aslantas-gungor-2026-unified">
    <titleInfo>
      <title>A Unified Turkic Idiom Understanding Benchmark: Idiom Detection and Semantic Retrieval Across Five Turkic Languages</title>
    </titleInfo>
    <!-- Authors of the paper, in citation order. -->
    <name type="personal">
      <namePart type="given">Gözde</namePart>
      <namePart type="family">Aslantaş</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tunga</namePart>
      <namePart type="family">Gungor</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host volume: proceedings title, editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Second Workshop Natural Language Processing for Turkic Languages (SIGTURK 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Kemal</namePart>
        <namePart type="family">Oflazer</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Abdullatif</namePart>
        <namePart type="family">Köksal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Onur</namePart>
        <namePart type="family">Varol</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-370-8</identifier>
    </relatedItem>
    <abstract>Idiomatic expressions are culturally grounded, semantically opaque, and difficult to interpret for multilingual natural language processing systems. Despite the large speaker population of Turkic languages, resources that focus on monolingual and cross-lingual idioms and their meanings are limited. We introduce the first unified benchmark for idiom understanding across Turkish, Azerbaijani, Turkmen, Gagauz, and Uzbek languages. The datasets compiled include token-level idiom span annotations. We develop models for idiom identification and semantic retrieval tasks. We evaluate seven models for idiom identification and nine embedding models for semantic retrieval tasks under several fine-tuning schemes using standard dense retrieval metrics. This benchmark provides a basis for studying idiomatic phenomena in Turkic languages and clarifies how idiomatic meanings are shared, altered, or diverge across languages.</abstract>
    <identifier type="citekey">aslantas-gungor-2026-unified</identifier>
    <location>
      <url>https://aclanthology.org/2026.sigturk-1.4/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>38</start>
        <end>51</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T A Unified Turkic Idiom Understanding Benchmark: Idiom Detection and Semantic Retrieval Across Five Turkic Languages
%A Aslantaş, Gözde
%A Gungor, Tunga
%Y Oflazer, Kemal
%Y Köksal, Abdullatif
%Y Varol, Onur
%S Proceedings of the Second Workshop Natural Language Processing for Turkic Languages (SIGTURK 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-370-8
%F aslantas-gungor-2026-unified
%X Idiomatic expressions are culturally grounded, semantically opaque, and difficult to interpret for multilingual natural language processing systems. Despite the large speaker population of Turkic languages, resources that focus on monolingual and cross-lingual idioms and their meanings are limited. We introduce the first unified benchmark for idiom understanding across Turkish, Azerbaijani, Turkmen, Gagauz, and Uzbek languages. The datasets compiled include token-level idiom span annotations. We develop models for idiom identification and semantic retrieval tasks. We evaluate seven models for idiom identification and nine embedding models for semantic retrieval tasks under several fine-tuning schemes using standard dense retrieval metrics. This benchmark provides a basis for studying idiomatic phenomena in Turkic languages and clarifies how idiomatic meanings are shared, altered, or diverge across languages.
%U https://aclanthology.org/2026.sigturk-1.4/
%P 38-51
Markdown (Informal)
[A Unified Turkic Idiom Understanding Benchmark: Idiom Detection and Semantic Retrieval Across Five Turkic Languages](https://aclanthology.org/2026.sigturk-1.4/) (Aslantaş & Gungor, SIGTURK 2026)
ACL