@inproceedings{dadason-loftsson-2024-unsupervised,
title = "Unsupervised Outlier Detection for Language-Independent Text Quality Filtering",
author = "Da{\dh}ason, J{\'o}n and
Loftsson, Hrafn",
editor = "Melero, Maite and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.sigul-1.46",
pages = "383--393",
abstract = "Web-crawled corpora offer an abundant source of training data for language models. However, they are generally noisy and are typically filtered using heuristic rules or classifiers. These methods require careful tuning or labeling by fluent speakers. In this paper, we assess the effectiveness of commonly applied rules on TQ-IS, a manually labeled text quality dataset for Icelandic. Additionally, we advocate for the utilization of unsupervised clustering and outlier detection algorithms for filtering. These algorithms are language-independent, computationally efficient and do not require language expertise. Using grid search, we find the optimal configuration for every combination of rules, optimizing for F1 score on TQ-IS. For a rule-based approach, we discover that optimal results can be achieved with only a small subset of the full ruleset. Using five rules, we obtain an F1 score of 98.2{\%}. We then evaluate three unsupervised algorithms, i.e., Gaussian Mixture Models (GMMs), Isolation Forests and One-Class SVMs. Our findings reveal that unsupervised algorithms perform well on the TQ-IS dataset, with GMMs obtaining the best results, comparable to those obtained with the rule-based approach. Finally, we show that unsupervised methods appear to be equally suitable for languages other than Icelandic, including Estonian and Basque.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dadason-loftsson-2024-unsupervised">
<titleInfo>
<title>Unsupervised Outlier Detection for Language-Independent Text Quality Filtering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jón</namePart>
<namePart type="family">Daðason</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hrafn</namePart>
<namePart type="family">Loftsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maite</namePart>
<namePart type="family">Melero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Web-crawled corpora offer an abundant source of training data for language models. However, they are generally noisy and are typically filtered using heuristic rules or classifiers. These methods require careful tuning or labeling by fluent speakers. In this paper, we assess the effectiveness of commonly applied rules on TQ-IS, a manually labeled text quality dataset for Icelandic. Additionally, we advocate for the utilization of unsupervised clustering and outlier detection algorithms for filtering. These algorithms are language-independent, computationally efficient and do not require language expertise. Using grid search, we find the optimal configuration for every combination of rules, optimizing for F1 score on TQ-IS. For a rule-based approach, we discover that optimal results can be achieved with only a small subset of the full ruleset. Using five rules, we obtain an F1 score of 98.2%. We then evaluate three unsupervised algorithms, i.e., Gaussian Mixture Models (GMMs), Isolation Forests and One-Class SVMs. Our findings reveal that unsupervised algorithms perform well on the TQ-IS dataset, with GMMs obtaining the best results, comparable to those obtained with the rule-based approach. Finally, we show that unsupervised methods appear to be equally suitable for languages other than Icelandic, including Estonian and Basque.</abstract>
<identifier type="citekey">dadason-loftsson-2024-unsupervised</identifier>
<location>
<url>https://aclanthology.org/2024.sigul-1.46</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>383</start>
<end>393</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Unsupervised Outlier Detection for Language-Independent Text Quality Filtering
%A Daðason, Jón
%A Loftsson, Hrafn
%Y Melero, Maite
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F dadason-loftsson-2024-unsupervised
%X Web-crawled corpora offer an abundant source of training data for language models. However, they are generally noisy and are typically filtered using heuristic rules or classifiers. These methods require careful tuning or labeling by fluent speakers. In this paper, we assess the effectiveness of commonly applied rules on TQ-IS, a manually labeled text quality dataset for Icelandic. Additionally, we advocate for the utilization of unsupervised clustering and outlier detection algorithms for filtering. These algorithms are language-independent, computationally efficient and do not require language expertise. Using grid search, we find the optimal configuration for every combination of rules, optimizing for F1 score on TQ-IS. For a rule-based approach, we discover that optimal results can be achieved with only a small subset of the full ruleset. Using five rules, we obtain an F1 score of 98.2%. We then evaluate three unsupervised algorithms, i.e., Gaussian Mixture Models (GMMs), Isolation Forests and One-Class SVMs. Our findings reveal that unsupervised algorithms perform well on the TQ-IS dataset, with GMMs obtaining the best results, comparable to those obtained with the rule-based approach. Finally, we show that unsupervised methods appear to be equally suitable for languages other than Icelandic, including Estonian and Basque.
%U https://aclanthology.org/2024.sigul-1.46
%P 383-393
Markdown (Informal)
[Unsupervised Outlier Detection for Language-Independent Text Quality Filtering](https://aclanthology.org/2024.sigul-1.46) (Daðason & Loftsson, SIGUL-WS 2024)
ACL