@inproceedings{muller-eberstein-etal-2025-decaf,
title = "{DECAF}: A Dynamically Extensible Corpus Analysis Framework",
author = {M{\"u}ller-Eberstein, Max and
van der Goot, Rob and
Rogers, Anna},
editor = "Mishra, Pushkar and
Muresan, Smaranda and
Yu, Tao",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-demo.34/",
doi = "10.18653/v1/2025.acl-demo.34",
pages = "351--362",
ISBN = "979-8-89176-253-4",
abstract = "The study of generalization in Language Models (LMs) requires controlled experiments that can precisely measure complex linguistic variations between training and testing datasets. We introduce DECAF, a framework that enables the analysis and filtering of linguistically-annotated datasets down to the character level. Rather than creating new resources for each experiment, DECAF starts from datasets with existing linguistic annotations, and leverages them to analyze, filter, and generate highly controlled and reproducible experimental settings targeting specific research questions. We demonstrate DECAF{'}s functionality by adding 28 morphosyntactic annotation layers to the 115M-word BabyLM corpus and indexing the resulting 1.1B annotations to analyze its internal domain variance, and to create a controlled training data curriculum for a small-scale gender bias study. We release DECAF as an open-source Python library, along with the parsed and indexed version of BabyLM, as resources for future generalization research."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="muller-eberstein-etal-2025-decaf">
<titleInfo>
<title>DECAF: A Dynamically Extensible Corpus Analysis Framework</title>
</titleInfo>
<name type="personal">
<namePart type="given">Max</namePart>
<namePart type="family">Müller-Eberstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rob</namePart>
<namePart type="family">van der Goot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pushkar</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tao</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-253-4</identifier>
</relatedItem>
<abstract>The study of generalization in Language Models (LMs) requires controlled experiments that can precisely measure complex linguistic variations between training and testing datasets. We introduce DECAF, a framework that enables the analysis and filtering of linguistically-annotated datasets down to the character level. Rather than creating new resources for each experiment, DECAF starts from datasets with existing linguistic annotations, and leverages them to analyze, filter, and generate highly controlled and reproducible experimental settings targeting specific research questions. We demonstrate DECAF’s functionality by adding 28 morphosyntactic annotation layers to the 115M-word BabyLM corpus and indexing the resulting 1.1B annotations to analyze its internal domain variance, and to create a controlled training data curriculum for a small-scale gender bias study. We release DECAF as an open-source Python library, along with the parsed and indexed version of BabyLM, as resources for future generalization research.</abstract>
<identifier type="citekey">muller-eberstein-etal-2025-decaf</identifier>
<identifier type="doi">10.18653/v1/2025.acl-demo.34</identifier>
<location>
<url>https://aclanthology.org/2025.acl-demo.34/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>351</start>
<end>362</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DECAF: A Dynamically Extensible Corpus Analysis Framework
%A Müller-Eberstein, Max
%A van der Goot, Rob
%A Rogers, Anna
%Y Mishra, Pushkar
%Y Muresan, Smaranda
%Y Yu, Tao
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-253-4
%F muller-eberstein-etal-2025-decaf
%X The study of generalization in Language Models (LMs) requires controlled experiments that can precisely measure complex linguistic variations between training and testing datasets. We introduce DECAF, a framework that enables the analysis and filtering of linguistically-annotated datasets down to the character level. Rather than creating new resources for each experiment, DECAF starts from datasets with existing linguistic annotations, and leverages them to analyze, filter, and generate highly controlled and reproducible experimental settings targeting specific research questions. We demonstrate DECAF’s functionality by adding 28 morphosyntactic annotation layers to the 115M-word BabyLM corpus and indexing the resulting 1.1B annotations to analyze its internal domain variance, and to create a controlled training data curriculum for a small-scale gender bias study. We release DECAF as an open-source Python library, along with the parsed and indexed version of BabyLM, as resources for future generalization research.
%R 10.18653/v1/2025.acl-demo.34
%U https://aclanthology.org/2025.acl-demo.34/
%U https://doi.org/10.18653/v1/2025.acl-demo.34
%P 351-362
Markdown (Informal)
[DECAF: A Dynamically Extensible Corpus Analysis Framework](https://aclanthology.org/2025.acl-demo.34/) (Müller-Eberstein et al., ACL 2025)
ACL
- Max Müller-Eberstein, Rob van der Goot, and Anna Rogers. 2025. DECAF: A Dynamically Extensible Corpus Analysis Framework. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations), pages 351–362, Vienna, Austria. Association for Computational Linguistics.