% BibTeX record for ACL Anthology paper 2025.acl-industry.58 (Akbiyik et al., ACL 2025).
@inproceedings{akbiyik-etal-2025-semantic,
  title     = {Semantic Outlier Removal with Embedding Models and {LLM}s},
  author    = {Akbiyik, Eren and
               Almeida, Jo{\~a}o F. M. De and
               Melis, Rik and
               Sriram, Ritu and
               Petrescu, Viviana and
               Vilhj{\'a}lmsson, Vilhj{\'a}lmur},
  editor    = {Rehm, Georg and
               Li, Yunyao},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.acl-industry.58/},
  doi       = {10.18653/v1/2025.acl-industry.58},
  pages     = {826--835},
  isbn      = {979-8-89176-288-6},
  abstract  = {Modern text processing pipelines demand robust methods to remove extraneous content while preserving a document{'}s core message. Traditional approaches{---}such as HTML boilerplate extraction or keyword filters{---}often fail in multilingual settings and struggle with context-sensitive nuances, whereas Large Language Models (LLMs) offer improved quality at high computational cost. We introduce SORE (Semantic Outlier Removal), a cost-effective, transparent method that leverages multilingual sentence embeddings and approximate nearest-neighbor search to identify and excise unwanted text segments. By first identifying core content via metadata embedding and then flagging segments that either closely match predefined outlier groups or deviate significantly from the core, SORE achieves near-LLM extraction precision at a fraction of the cost. Experiments on HTML datasets demonstrate that SORE outperforms structural methods and yields high precision in diverse scenarios. Our system is currently deployed in production, processing millions of documents daily across multiple languages while maintaining both efficiency and accuracy. To facilitate reproducibility and further research, we will publicly release our implementation and evaluation datasets.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey akbiyik-etal-2025-semantic. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="akbiyik-etal-2025-semantic">
    <titleInfo>
      <title>Semantic Outlier Removal with Embedding Models and LLMs</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Eren</namePart>
      <namePart type="family">Akbiyik</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">João</namePart>
      <namePart type="given">F</namePart>
      <namePart type="given">M</namePart>
      <namePart type="given">De</namePart>
      <namePart type="family">Almeida</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rik</namePart>
      <namePart type="family">Melis</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ritu</namePart>
      <namePart type="family">Sriram</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Viviana</namePart>
      <namePart type="family">Petrescu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vilhjálmur</namePart>
      <namePart type="family">Vilhjálmsson</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Georg</namePart>
        <namePart type="family">Rehm</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yunyao</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-288-6</identifier>
    </relatedItem>
    <abstract>Modern text processing pipelines demand robust methods to remove extraneous content while preserving a document’s core message. Traditional approaches—such as HTML boilerplate extraction or keyword filters—often fail in multilingual settings and struggle with context-sensitive nuances, whereas Large Language Models (LLMs) offer improved quality at high computational cost. We introduce SORE (Semantic Outlier Removal), a cost-effective, transparent method that leverages multilingual sentence embeddings and approximate nearest-neighbor search to identify and excise unwanted text segments. By first identifying core content via metadata embedding and then flagging segments that either closely match predefined outlier groups or deviate significantly from the core, SORE achieves near-LLM extraction precision at a fraction of the cost. Experiments on HTML datasets demonstrate that SORE outperforms structural methods and yields high precision in diverse scenarios. Our system is currently deployed in production, processing millions of documents daily across multiple languages while maintaining both efficiency and accuracy. To facilitate reproducibility and further research, we will publicly release our implementation and evaluation datasets.</abstract>
    <identifier type="citekey">akbiyik-etal-2025-semantic</identifier>
    <identifier type="doi">10.18653/v1/2025.acl-industry.58</identifier>
    <location>
      <url>https://aclanthology.org/2025.acl-industry.58/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>826</start>
        <end>835</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Semantic Outlier Removal with Embedding Models and LLMs
%A Akbiyik, Eren
%A Almeida, João F. M. De
%A Melis, Rik
%A Sriram, Ritu
%A Petrescu, Viviana
%A Vilhjálmsson, Vilhjálmur
%Y Rehm, Georg
%Y Li, Yunyao
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-288-6
%F akbiyik-etal-2025-semantic
%X Modern text processing pipelines demand robust methods to remove extraneous content while preserving a document’s core message. Traditional approaches—such as HTML boilerplate extraction or keyword filters—often fail in multilingual settings and struggle with context-sensitive nuances, whereas Large Language Models (LLMs) offer improved quality at high computational cost. We introduce SORE (Semantic Outlier Removal), a cost-effective, transparent method that leverages multilingual sentence embeddings and approximate nearest-neighbor search to identify and excise unwanted text segments. By first identifying core content via metadata embedding and then flagging segments that either closely match predefined outlier groups or deviate significantly from the core, SORE achieves near-LLM extraction precision at a fraction of the cost. Experiments on HTML datasets demonstrate that SORE outperforms structural methods and yields high precision in diverse scenarios. Our system is currently deployed in production, processing millions of documents daily across multiple languages while maintaining both efficiency and accuracy. To facilitate reproducibility and further research, we will publicly release our implementation and evaluation datasets.
%R 10.18653/v1/2025.acl-industry.58
%U https://aclanthology.org/2025.acl-industry.58/
%U https://doi.org/10.18653/v1/2025.acl-industry.58
%P 826-835
Markdown (Informal)
[Semantic Outlier Removal with Embedding Models and LLMs](https://aclanthology.org/2025.acl-industry.58/) (Akbiyik et al., ACL 2025)
ACL
- Eren Akbiyik, João F. M. De Almeida, Rik Melis, Ritu Sriram, Viviana Petrescu, and Vilhjálmur Vilhjálmsson. 2025. Semantic Outlier Removal with Embedding Models and LLMs. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track), pages 826–835, Vienna, Austria. Association for Computational Linguistics.