@inproceedings{dent-etal-2025-identifying,
title = "Identifying Rare Languages in {C}ommon {C}rawl Data is a Needles-in-a-Haystack Problem",
author = "Dent, Rasul and
Ortiz Suarez, Pedro and
Cl{\'e}rice, Thibault and
Sagot, Beno{\^i}t",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.77/",
doi = "10.18653/v1/2025.findings-emnlp.77",
pages = "1460--1473",
ISBN = "979-8-89176-335-7",
abstract = "Automatic language identification is frequently framed as a multi-class classification problem. However, when creating digital corpora for less commonly written languages, it may be more appropriate to consider it a data mining problem. For these varieties, one knows ahead of time that the vast majority of documents are of little interest. By minimizing resources spent on classifying such documents, we can create corpora covering previously overlooked languages faster than existing pipelines. To demonstrate the effectiveness of the targeted mining perspective, we introduce a new pipeline that can filter a single snapshot in two hours. We also provide web corpora for several French-based Creoles."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dent-etal-2025-identifying">
<titleInfo>
<title>Identifying Rare Languages in Common Crawl Data is a Needles-in-a-Haystack Problem</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rasul</namePart>
<namePart type="family">Dent</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pedro</namePart>
<namePart type="family">Ortiz Suarez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thibault</namePart>
<namePart type="family">Clérice</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benoît</namePart>
<namePart type="family">Sagot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Automatic language identification is frequently framed as a multi-class classification problem. However, when creating digital corpora for less commonly written languages, it may be more appropriate to consider it a data mining problem. For these varieties, one knows ahead of time that the vast majority of documents are of little interest. By minimizing resources spent on classifying such documents, we can create corpora covering previously overlooked languages faster than existing pipelines. To demonstrate the effectiveness of the targeted mining perspective, we introduce a new pipeline that can filter a single snapshot in two hours. We also provide web corpora for several French-based Creoles.</abstract>
<identifier type="citekey">dent-etal-2025-identifying</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.77</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.77/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>1460</start>
<end>1473</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Identifying Rare Languages in Common Crawl Data is a Needles-in-a-Haystack Problem
%A Dent, Rasul
%A Ortiz Suarez, Pedro
%A Clérice, Thibault
%A Sagot, Benoît
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F dent-etal-2025-identifying
%X Automatic language identification is frequently framed as a multi-class classification problem. However, when creating digital corpora for less commonly written languages, it may be more appropriate to consider it a data mining problem. For these varieties, one knows ahead of time that the vast majority of documents are of little interest. By minimizing resources spent on classifying such documents, we can create corpora covering previously overlooked languages faster than existing pipelines. To demonstrate the effectiveness of the targeted mining perspective, we introduce a new pipeline that can filter a single snapshot in two hours. We also provide web corpora for several French-based Creoles.
%R 10.18653/v1/2025.findings-emnlp.77
%U https://aclanthology.org/2025.findings-emnlp.77/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.77
%P 1460-1473
Markdown (Informal)
[Identifying Rare Languages in Common Crawl Data is a Needles-in-a-Haystack Problem](https://aclanthology.org/2025.findings-emnlp.77/) (Dent et al., Findings 2025)
ACL