@inproceedings{singha-etal-2025-tecofes,
title = "{T}e{C}o{F}e{S}: Text Column Featurization using Semantic Analysis",
author = "Singha, Ananya and
Singh, Mukul and
Tiwari, Ashish and
Gulwani, Sumit and
Le, Vu and
Parnin, Chris",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.392/",
doi = "10.18653/v1/2025.findings-naacl.392",
pages = "7055--7061",
ISBN = "979-8-89176-195-7",
abstract = "Extracting insights from text columns can be challenging and time-intensive. Existing methods for topic modeling and feature extraction are based on syntactic features and often overlook the semantics. We introduce the semantic text column featurization problem, and present a scalable approach for automatically solving it. We extract a small sample smartly, use a large language model (LLM) to label only the sample, and then lift the labeling to the whole column using text embeddings. We evaluate our approach by turning existing text classification benchmarks into semantic categorization benchmarks. Our approach performs better than baselines and naive use of LLMs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="singha-etal-2025-tecofes">
<titleInfo>
<title>TeCoFeS: Text Column Featurization using Semantic Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ananya</namePart>
<namePart type="family">Singha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mukul</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashish</namePart>
<namePart type="family">Tiwari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sumit</namePart>
<namePart type="family">Gulwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vu</namePart>
<namePart type="family">Le</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Parnin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>Extracting insights from text columns can be challenging and time-intensive. Existing methods for topic modeling and feature extraction are based on syntactic features and often overlook the semantics. We introduce the semantic text column featurization problem, and present a scalable approach for automatically solving it. We extract a small sample smartly, use a large language model (LLM) to label only the sample, and then lift the labeling to the whole column using text embeddings. We evaluate our approach by turning existing text classification benchmarks into semantic categorization benchmarks. Our approach performs better than baselines and naive use of LLMs.</abstract>
<identifier type="citekey">singha-etal-2025-tecofes</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.392</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.392/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>7055</start>
<end>7061</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TeCoFeS: Text Column Featurization using Semantic Analysis
%A Singha, Ananya
%A Singh, Mukul
%A Tiwari, Ashish
%A Gulwani, Sumit
%A Le, Vu
%A Parnin, Chris
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F singha-etal-2025-tecofes
%X Extracting insights from text columns can be challenging and time-intensive. Existing methods for topic modeling and feature extraction are based on syntactic features and often overlook the semantics. We introduce the semantic text column featurization problem, and present a scalable approach for automatically solving it. We extract a small sample smartly, use a large language model (LLM) to label only the sample, and then lift the labeling to the whole column using text embeddings. We evaluate our approach by turning existing text classification benchmarks into semantic categorization benchmarks. Our approach performs better than baselines and naive use of LLMs.
%R 10.18653/v1/2025.findings-naacl.392
%U https://aclanthology.org/2025.findings-naacl.392/
%U https://doi.org/10.18653/v1/2025.findings-naacl.392
%P 7055-7061
Markdown (Informal)
[TeCoFeS: Text Column Featurization using Semantic Analysis](https://aclanthology.org/2025.findings-naacl.392/) (Singha et al., Findings 2025)
ACL
- Ananya Singha, Mukul Singh, Ashish Tiwari, Sumit Gulwani, Vu Le, and Chris Parnin. 2025. TeCoFeS: Text Column Featurization using Semantic Analysis. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 7055–7061, Albuquerque, New Mexico. Association for Computational Linguistics.