% Braces inside title/booktitle protect the corpus name and the workshop
% acronym from being lowercased by styles that apply sentence casing.
@inproceedings{ferreira-2025-building,
    title = "Building a {Compact Math Corpus}",
    author = "Ferreira, Andrea",
    editor = "Abzianidze, Lasha and
      de Paiva, Valeria",
    booktitle = "Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning ({NALOMA})",
    month = aug,
    year = "2025",
    address = "Bochum, Germany",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.naloma-1.5/",
    pages = "48--55",
    isbn = "979-8-89176-287-9",
    abstract = "This paper introduces the Compact Math Corpus (CMC), a preliminary resource for natural language processing in the mathematics domain. We process three open-access undergraduate textbooks from distinct mathematical areas and annotate them in the CoNLL-U format using a lightweight pipeline based on the spaCy Small model. The structured output enables the extraction of syntactic bigrams and TF-IDF scores, supporting a syntactic-semantic analysis of mathematical sentences. From the annotated data, we construct a classification dataset comprising bigrams potentially representing mathematical concepts, along with representative example sentences. We combine CMC with the conversational corpus UD English EWT and train a logistic regression model with K-fold cross-validation, achieving a minimum macro-F1 score of 0.989. These results indicate the feasibility of automatic concept identification in mathematical texts. The study is designed for easy replication in low-resource settings and to promote sustainable research practices. Our approach offers a viable path to tasks such as parser adaptation, terminology extraction, multiword expression modeling, and improved analysis of mathematical language structures."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ferreira-2025-building">
<titleInfo>
<title>Building a Compact Math Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Ferreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning (NALOMA)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lasha</namePart>
<namePart type="family">Abzianidze</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valeria</namePart>
<namePart type="family">de Paiva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bochum, Germany</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-287-9</identifier>
</relatedItem>
<abstract>This paper introduces the Compact Math Corpus (CMC), a preliminary resource for natural language processing in the mathematics domain. We process three open-access undergraduate textbooks from distinct mathematical areas and annotate them in the CoNLL-U format using a lightweight pipeline based on the spaCy Small model. The structured output enables the extraction of syntactic bigrams and TF-IDF scores, supporting a syntactic-semantic analysis of mathematical sentences. From the annotated data, we construct a classification dataset comprising bigrams potentially representing mathematical concepts, along with representative example sentences. We combine CMC with the conversational corpus UD English EWT and train a logistic regression model with K-fold cross-validation, achieving a minimum macro-F1 score of 0.989. These results indicate the feasibility of automatic concept identification in mathematical texts. The study is designed for easy replication in low-resource settings and to promote sustainable research practices. Our approach offers a viable path to tasks such as parser adaptation, terminology extraction, multiword expression modeling, and improved analysis of mathematical language structures.</abstract>
<identifier type="citekey">ferreira-2025-building</identifier>
<location>
<url>https://aclanthology.org/2025.naloma-1.5/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>48</start>
<end>55</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Building a Compact Math Corpus
%A Ferreira, Andrea
%Y Abzianidze, Lasha
%Y de Paiva, Valeria
%S Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning (NALOMA)
%D 2025
%8 August
%I Association for Computational Linguistics
%C Bochum, Germany
%@ 979-8-89176-287-9
%F ferreira-2025-building
%X This paper introduces the Compact Math Corpus (CMC), a preliminary resource for natural language processing in the mathematics domain. We process three open-access undergraduate textbooks from distinct mathematical areas and annotate them in the CoNLL-U format using a lightweight pipeline based on the spaCy Small model. The structured output enables the extraction of syntactic bigrams and TF-IDF scores, supporting a syntactic-semantic analysis of mathematical sentences. From the annotated data, we construct a classification dataset comprising bigrams potentially representing mathematical concepts, along with representative example sentences. We combine CMC with the conversational corpus UD English EWT and train a logistic regression model with K-fold cross-validation, achieving a minimum macro-F1 score of 0.989. These results indicate the feasibility of automatic concept identification in mathematical texts. The study is designed for easy replication in low-resource settings and to promote sustainable research practices. Our approach offers a viable path to tasks such as parser adaptation, terminology extraction, multiword expression modeling, and improved analysis of mathematical language structures.
%U https://aclanthology.org/2025.naloma-1.5/
%P 48-55
Markdown (Informal)
[Building a Compact Math Corpus](https://aclanthology.org/2025.naloma-1.5/) (Ferreira, NALOMA 2025)
ACL
- Andrea Ferreira. 2025. Building a Compact Math Corpus. In Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning (NALOMA), pages 48–55, Bochum, Germany. Association for Computational Linguistics.