@inproceedings{dementieva-etal-2024-multiparadetox,
title = "{M}ulti{P}ara{D}etox: Extending Text Detoxification with Parallel Data to New Languages",
author = "Dementieva, Daryna and
Babakov, Nikolay and
Panchenko, Alexander",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-short.12",
doi = "10.18653/v1/2024.naacl-short.12",
pages = "124--140",
abstract = "Text detoxification is a textual style transfer (TST) task where a text is paraphrased from a toxic surface form, e.g. featuring rude words, to the neutral register. Recently, text detoxification methods found their applications in various task such as detoxification of Large Language Models (LLMs) (Leong et al., 2023; He et al., 2024; Tang et al., 2023) and toxic speech combating in social networks (Deng et al., 2023; Mun et al., 2023; Agarwal et al., 2023). All these applications are extremely important to ensure safe communication in modern digital worlds. However, the previous approaches for parallel text detoxification corpora collection{---}ParaDetox (Logacheva et al., 2022) and APPADIA (Atwell et al., 2022){---}were explored only in monolingual setup. In this work, we aim to extend ParaDetox pipeline to multiple languages presenting MultiParaDetox to automate parallel detoxification corpus collection for potentially any language. Then, we experiment with different text detoxification models{---}from unsupervised baselines to LLMs and fine-tuned models on the presented parallel corpora{---}showing the great benefit of parallel corpus presence to obtain state-of-the-art text detoxification models for any language.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dementieva-etal-2024-multiparadetox">
<titleInfo>
<title>MultiParaDetox: Extending Text Detoxification with Parallel Data to New Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daryna</namePart>
<namePart type="family">Dementieva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolay</namePart>
<namePart type="family">Babakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Panchenko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Text detoxification is a textual style transfer (TST) task where a text is paraphrased from a toxic surface form, e.g. featuring rude words, to the neutral register. Recently, text detoxification methods found their applications in various task such as detoxification of Large Language Models (LLMs) (Leong et al., 2023; He et al., 2024; Tang et al., 2023) and toxic speech combating in social networks (Deng et al., 2023; Mun et al., 2023; Agarwal et al., 2023). All these applications are extremely important to ensure safe communication in modern digital worlds. However, the previous approaches for parallel text detoxification corpora collection—ParaDetox (Logacheva et al., 2022) and APPADIA (Atwell et al., 2022)—were explored only in monolingual setup. In this work, we aim to extend ParaDetox pipeline to multiple languages presenting MultiParaDetox to automate parallel detoxification corpus collection for potentially any language. Then, we experiment with different text detoxification models—from unsupervised baselines to LLMs and fine-tuned models on the presented parallel corpora—showing the great benefit of parallel corpus presence to obtain state-of-the-art text detoxification models for any language.</abstract>
<identifier type="citekey">dementieva-etal-2024-multiparadetox</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-short.12</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-short.12</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>124</start>
<end>140</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MultiParaDetox: Extending Text Detoxification with Parallel Data to New Languages
%A Dementieva, Daryna
%A Babakov, Nikolay
%A Panchenko, Alexander
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F dementieva-etal-2024-multiparadetox
%X Text detoxification is a textual style transfer (TST) task where a text is paraphrased from a toxic surface form, e.g. featuring rude words, to the neutral register. Recently, text detoxification methods found their applications in various task such as detoxification of Large Language Models (LLMs) (Leong et al., 2023; He et al., 2024; Tang et al., 2023) and toxic speech combating in social networks (Deng et al., 2023; Mun et al., 2023; Agarwal et al., 2023). All these applications are extremely important to ensure safe communication in modern digital worlds. However, the previous approaches for parallel text detoxification corpora collection—ParaDetox (Logacheva et al., 2022) and APPADIA (Atwell et al., 2022)—were explored only in monolingual setup. In this work, we aim to extend ParaDetox pipeline to multiple languages presenting MultiParaDetox to automate parallel detoxification corpus collection for potentially any language. Then, we experiment with different text detoxification models—from unsupervised baselines to LLMs and fine-tuned models on the presented parallel corpora—showing the great benefit of parallel corpus presence to obtain state-of-the-art text detoxification models for any language.
%R 10.18653/v1/2024.naacl-short.12
%U https://aclanthology.org/2024.naacl-short.12
%U https://doi.org/10.18653/v1/2024.naacl-short.12
%P 124-140
Markdown (Informal)
[MultiParaDetox: Extending Text Detoxification with Parallel Data to New Languages](https://aclanthology.org/2024.naacl-short.12) (Dementieva et al., NAACL 2024)
ACL