@inproceedings{kuznetsov-etal-2025-feature,
title = "Feature-Level Insights into Artificial Text Detection with Sparse Autoencoders",
author = "Kuznetsov, Kristian and
Kushnareva, Laida and
Razzhigaev, Anton and
Druzhinina, Polina and
Voznyuk, Anastasia and
Piontkovskaya, Irina and
Burnaev, Evgeny and
Barannikov, Serguei",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1321/",
doi = "10.18653/v1/2025.findings-acl.1321",
pages = "25727--25748",
ISBN = "979-8-89176-256-5",
abstract = "Artificial Text Detection (ATD) is becoming increasingly important with the rise of advanced Large Language Models (LLMs). Despite numerous efforts, no single algorithm performs consistently well across different types of unseen text or guarantees effective generalization to new LLMs. Interpretability plays a crucial role in achieving this goal. In this study, we enhance ATD interpretability by using Sparse Autoencoders (SAE) to extract features from Gemma-2-2B{'}s residual stream. We identify both interpretable and efficient features, analyzing their semantics and relevance through domain- and model-specific statistics, a steering approach, and manual or LLM-based interpretation of obtained features. Our methods offer valuable insights into how texts from various models differ from human-written content. We show that modern LLMs have a distinct writing style, especially in information-dense domains, even though they can produce human-like outputs with personalized prompts. The code for this paper is available at https://github.com/pyashy/SAE{\_}ATD."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kuznetsov-etal-2025-feature">
<titleInfo>
<title>Feature-Level Insights into Artificial Text Detection with Sparse Autoencoders</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kristian</namePart>
<namePart type="family">Kuznetsov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laida</namePart>
<namePart type="family">Kushnareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anton</namePart>
<namePart type="family">Razzhigaev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Polina</namePart>
<namePart type="family">Druzhinina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Voznyuk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Irina</namePart>
<namePart type="family">Piontkovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Evgeny</namePart>
<namePart type="family">Burnaev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Serguei</namePart>
<namePart type="family">Barannikov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Artificial Text Detection (ATD) is becoming increasingly important with the rise of advanced Large Language Models (LLMs). Despite numerous efforts, no single algorithm performs consistently well across different types of unseen text or guarantees effective generalization to new LLMs. Interpretability plays a crucial role in achieving this goal. In this study, we enhance ATD interpretability by using Sparse Autoencoders (SAE) to extract features from Gemma-2-2B’s residual stream. We identify both interpretable and efficient features, analyzing their semantics and relevance through domain- and model-specific statistics, a steering approach, and manual or LLM-based interpretation of obtained features. Our methods offer valuable insights into how texts from various models differ from human-written content. We show that modern LLMs have a distinct writing style, especially in information-dense domains, even though they can produce human-like outputs with personalized prompts. The code for this paper is available at https://github.com/pyashy/SAE_ATD.</abstract>
<identifier type="citekey">kuznetsov-etal-2025-feature</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1321</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1321/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>25727</start>
<end>25748</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Feature-Level Insights into Artificial Text Detection with Sparse Autoencoders
%A Kuznetsov, Kristian
%A Kushnareva, Laida
%A Razzhigaev, Anton
%A Druzhinina, Polina
%A Voznyuk, Anastasia
%A Piontkovskaya, Irina
%A Burnaev, Evgeny
%A Barannikov, Serguei
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F kuznetsov-etal-2025-feature
%X Artificial Text Detection (ATD) is becoming increasingly important with the rise of advanced Large Language Models (LLMs). Despite numerous efforts, no single algorithm performs consistently well across different types of unseen text or guarantees effective generalization to new LLMs. Interpretability plays a crucial role in achieving this goal. In this study, we enhance ATD interpretability by using Sparse Autoencoders (SAE) to extract features from Gemma-2-2B’s residual stream. We identify both interpretable and efficient features, analyzing their semantics and relevance through domain- and model-specific statistics, a steering approach, and manual or LLM-based interpretation of obtained features. Our methods offer valuable insights into how texts from various models differ from human-written content. We show that modern LLMs have a distinct writing style, especially in information-dense domains, even though they can produce human-like outputs with personalized prompts. The code for this paper is available at https://github.com/pyashy/SAE_ATD.
%R 10.18653/v1/2025.findings-acl.1321
%U https://aclanthology.org/2025.findings-acl.1321/
%U https://doi.org/10.18653/v1/2025.findings-acl.1321
%P 25727-25748
Markdown (Informal)
[Feature-Level Insights into Artificial Text Detection with Sparse Autoencoders](https://aclanthology.org/2025.findings-acl.1321/) (Kuznetsov et al., Findings 2025)
ACL
- Kristian Kuznetsov, Laida Kushnareva, Anton Razzhigaev, Polina Druzhinina, Anastasia Voznyuk, Irina Piontkovskaya, Evgeny Burnaev, and Serguei Barannikov. 2025. Feature-Level Insights into Artificial Text Detection with Sparse Autoencoders. In Findings of the Association for Computational Linguistics: ACL 2025, pages 25727–25748, Vienna, Austria. Association for Computational Linguistics.