% BibTeX entry for the Tatoxa paper in the *SEM 2026 proceedings.
% Braced words in title/booktitle are protected from style-driven lowercasing.
% NOTE(review): address looks like the conference venue rather than the
% publisher's city; kept as exported, confirm against the target style.
@inproceedings{alimova-etal-2026-tatoxa,
  title     = {The {Tatoxa} System for Text Detoxification in Low-Resource Languages: The Case of {Tatar}},
  author    = {Alimova, Ilseyar and
               Monogov, Bogdan and
               Mazur, Artyom and
               Antonov, Daniil and
               Karimov, Vsevolod and
               Egorov, Vitaliy and
               Khakimov, Bulat and
               Panchenko, Alexander},
  editor    = {Mohammad, Saif M. and
               Ousidhoum, Nedjma},
  booktitle = {Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*{SEM} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.starsem-conference.17/},
  pages     = {264--274},
  isbn      = {979-8-89176-413-2},
  abstract  = {Text detoxification, the automated detection and mitigation of abusive and harmful content, is essential for ensuring the safety of online communities and protecting users. However, low resource languages such as Tatar have received little research attention. In this paper we present Tatoxa, a novel state-of-the-art system for text detoxification in the Tatar language. Comparative experiments show that the proposed approach outperforms existing open source and proprietary commercial LLMs on key quality metrics. We also introduce a new dataset for text detoxification in Tatar, designed for fine tuning and evaluation in low resource settings. Finally, cross lingual transfer experiments indicate that transfer from other languages, including the culturally close Russian, performs significantly worse than training on native Tatar data even when a large Russian corpus is available.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID equals the citekey identifier further down. -->
  <mods ID="alimova-etal-2026-tatoxa">
    <titleInfo>
      <title>The Tatoxa System for Text Detoxification in Low-Resource Languages: The Case of Tatar</title>
    </titleInfo>

    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Ilseyar</namePart>
      <namePart type="family">Alimova</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bogdan</namePart>
      <namePart type="family">Monogov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Artyom</namePart>
      <namePart type="family">Mazur</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Daniil</namePart>
      <namePart type="family">Antonov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vsevolod</namePart>
      <namePart type="family">Karimov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vitaliy</namePart>
      <namePart type="family">Egorov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bulat</namePart>
      <namePart type="family">Khakimov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alexander</namePart>
      <namePart type="family">Panchenko</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*SEM 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Saif</namePart>
        <namePart type="given">M</namePart>
        <namePart type="family">Mohammad</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nedjma</namePart>
        <namePart type="family">Ousidhoum</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-413-2</identifier>
    </relatedItem>

    <abstract>Text detoxification, the automated detection and mitigation of abusive and harmful content, is essential for ensuring the safety of online communities and protecting users. However, low resource languages such as Tatar have received little research attention. In this paper we present Tatoxa, a novel state-of-the-art system for text detoxification in the Tatar language. Comparative experiments show that the proposed approach outperforms existing open source and proprietary commercial LLMs on key quality metrics. We also introduce a new dataset for text detoxification in Tatar, designed for fine tuning and evaluation in low resource settings. Finally, cross lingual transfer experiments indicate that transfer from other languages, including the culturally close Russian, performs significantly worse than training on native Tatar data even when a large Russian corpus is available.</abstract>
    <identifier type="citekey">alimova-etal-2026-tatoxa</identifier>
    <location>
      <url>https://aclanthology.org/2026.starsem-conference.17/</url>
    </location>

    <!-- Date and page extent of this paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>264</start>
        <end>274</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T The Tatoxa System for Text Detoxification in Low-Resource Languages: The Case of Tatar
%A Alimova, Ilseyar
%A Monogov, Bogdan
%A Mazur, Artyom
%A Antonov, Daniil
%A Karimov, Vsevolod
%A Egorov, Vitaliy
%A Khakimov, Bulat
%A Panchenko, Alexander
%Y Mohammad, Saif M.
%Y Ousidhoum, Nedjma
%S Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*SEM 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-413-2
%F alimova-etal-2026-tatoxa
%X Text detoxification, the automated detection and mitigation of abusive and harmful content, is essential for ensuring the safety of online communities and protecting users. However, low resource languages such as Tatar have received little research attention. In this paper we present Tatoxa, a novel state-of-the-art system for text detoxification in the Tatar language. Comparative experiments show that the proposed approach outperforms existing open source and proprietary commercial LLMs on key quality metrics. We also introduce a new dataset for text detoxification in Tatar, designed for fine tuning and evaluation in low resource settings. Finally, cross lingual transfer experiments indicate that transfer from other languages, including the culturally close Russian, performs significantly worse than training on native Tatar data even when a large Russian corpus is available.
%U https://aclanthology.org/2026.starsem-conference.17/
%P 264-274
Markdown (Informal)
[The Tatoxa System for Text Detoxification in Low-Resource Languages: The Case of Tatar](https://aclanthology.org/2026.starsem-conference.17/) (Alimova et al., \*SEM 2026)
ACL
- Ilseyar Alimova, Bogdan Monogov, Artyom Mazur, Daniil Antonov, Vsevolod Karimov, Vitaliy Egorov, Bulat Khakimov, and Alexander Panchenko. 2026. The Tatoxa System for Text Detoxification in Low-Resource Languages: The Case of Tatar. In Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*SEM 2026), pages 264–274, San Diego, California, United States. Association for Computational Linguistics.