@inproceedings{alizada-dubossarsky-2026-benchmarking,
  title     = {Benchmarking Hate Speech Detection in {A}zerbaijani with {T}urkish Cross-Lingual Transfer and Transformer Models},
  author    = {Alizada, Tural and
               Dubossarsky, Haim},
  editor    = {Oflazer, Kemal and
               K{\"o}ksal, Abdullatif and
               Varol, Onur},
  booktitle = {Proceedings of the Second Workshop on Natural Language Processing for {T}urkic Languages ({SIGTURK} 2026)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.sigturk-1.10/},
  pages     = {103--112},
  isbn      = {979-8-89176-370-8},
  abstract  = {In this paper, we investigated the task of hate-speech classification in the closely related Turkic language pair, Turkish-Azerbaijani. Transformer models can achieve strong hate-speech classification in Turkish, but their performance does not reliably transfer to closely related low-resource languages without careful evaluation. We study Turkish{--}Azerbaijani hate speech detection and introduce the first manually annotated Azerbaijani benchmark, comprising 1,112 YouTube comments from major news channels with severe class imbalance. We compare XLM-RoBERTa and a compact BERT-Tiny model against a TF{--}IDF + logistic regression baseline under monolingual training, zero-shot Turkish{\textrightarrow}Azerbaijani transfer, low-resource balanced subsampling, bilingual mixed fine-tuning, and translation-based augmentation using machine-translated Turkish data. XLM-R attains high macro-F1 in Turkish and achieves moderate zero-shot transfer to Azerbaijani, but native Azerbaijani training is fragile for the hate class. Mixed bilingual training improves robustness for both languages, whereas TF{--}IDF generalizes poorly to Azerbaijani.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alizada-dubossarsky-2026-benchmarking">
<titleInfo>
<title>Benchmarking Hate Speech Detection in Azerbaijani with Turkish Cross-Lingual Transfer and Transformer Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tural</namePart>
<namePart type="family">Alizada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haim</namePart>
<namePart type="family">Dubossarsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Natural Language Processing for Turkic Languages (SIGTURK 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kemal</namePart>
<namePart type="family">Oflazer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdullatif</namePart>
<namePart type="family">Köksal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Onur</namePart>
<namePart type="family">Varol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-370-8</identifier>
</relatedItem>
<abstract>In this paper, we investigated the task of hate-speech classification in the closely related Turkic language pair, Turkish-Azerbaijani. Transformer models can achieve strong hate-speech classification in Turkish, but their performance does not reliably transfer to closely related low-resource languages without careful evaluation. We study Turkish–Azerbaijani hate speech detection and introduce the first manually annotated Azerbaijani benchmark, comprising 1,112 YouTube comments from major news channels with severe class imbalance. We compare XLM-RoBERTa and a compact BERT-Tiny model against a TF–IDF + logistic regression baseline under monolingual training, zero-shot Turkish→Azerbaijani transfer, low-resource balanced subsampling, bilingual mixed fine-tuning, and translation-based augmentation using machine-translated Turkish data. XLM-R attains high macro-F1 in Turkish and achieves moderate zero-shot transfer to Azerbaijani, but native Azerbaijani training is fragile for the hate class. Mixed bilingual training improves robustness for both languages, whereas TF–IDF generalizes poorly to Azerbaijani.</abstract>
<identifier type="citekey">alizada-dubossarsky-2026-benchmarking</identifier>
<location>
<url>https://aclanthology.org/2026.sigturk-1.10/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>103</start>
<end>112</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Benchmarking Hate Speech Detection in Azerbaijani with Turkish Cross-Lingual Transfer and Transformer Models
%A Alizada, Tural
%A Dubossarsky, Haim
%Y Oflazer, Kemal
%Y Köksal, Abdullatif
%Y Varol, Onur
%S Proceedings of the Second Workshop on Natural Language Processing for Turkic Languages (SIGTURK 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-370-8
%F alizada-dubossarsky-2026-benchmarking
%X In this paper, we investigated the task of hate-speech classification in the closely related Turkic language pair, Turkish-Azerbaijani. Transformer models can achieve strong hate-speech classification in Turkish, but their performance does not reliably transfer to closely related low-resource languages without careful evaluation. We study Turkish–Azerbaijani hate speech detection and introduce the first manually annotated Azerbaijani benchmark, comprising 1,112 YouTube comments from major news channels with severe class imbalance. We compare XLM-RoBERTa and a compact BERT-Tiny model against a TF–IDF + logistic regression baseline under monolingual training, zero-shot Turkish→Azerbaijani transfer, low-resource balanced subsampling, bilingual mixed fine-tuning, and translation-based augmentation using machine-translated Turkish data. XLM-R attains high macro-F1 in Turkish and achieves moderate zero-shot transfer to Azerbaijani, but native Azerbaijani training is fragile for the hate class. Mixed bilingual training improves robustness for both languages, whereas TF–IDF generalizes poorly to Azerbaijani.
%U https://aclanthology.org/2026.sigturk-1.10/
%P 103-112
Markdown (Informal)
[Benchmarking Hate Speech Detection in Azerbaijani with Turkish Cross-Lingual Transfer and Transformer Models](https://aclanthology.org/2026.sigturk-1.10/) (Alizada & Dubossarsky, SIGTURK 2026)
ACL