@inproceedings{warner-etal-2025-utilizing,
title = "Utilizing Semantic Textual Similarity for Clinical Survey Data Feature Selection",
author = "Warner, Benjamin C. and
Xu, Ziqi and
Haroutounian, Simon and
Kannampallil, Thomas and
Lu, Chenyang",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.27/",
doi = "10.18653/v1/2025.findings-acl.27",
pages = "502--520",
ISBN = "979-8-89176-256-5",
abstract = "Surveys are widely used to collect patient data in healthcare, and there is significant clinical interest in predicting patient outcomes using survey data. However, surveys often include numerous features that lead to high-dimensional inputs for machine learning models. This paper exploits a unique source of information in surveys for feature selection. We observe that feature names (i.e., survey questions) are often semantically indicative of what features are most useful. Using language models, we leverage semantic textual similarity (STS) scores between features and targets to select features. The performance of STS scores in directly ranking features as well as in the minimal-redundancy-maximal-relevance (mRMR) algorithm is evaluated using survey data collected as part of a clinical study on persistent post-surgical pain (PPSP) as well as an accessible dataset collected through the NIH All of Us program. Our findings show that features selected with STS can result in higher performance models compared to traditional feature selection algorithms."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="warner-etal-2025-utilizing">
<titleInfo>
<title>Utilizing Semantic Textual Similarity for Clinical Survey Data Feature Selection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="given">C</namePart>
<namePart type="family">Warner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ziqi</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Haroutounian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Kannampallil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenyang</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Surveys are widely used to collect patient data in healthcare, and there is significant clinical interest in predicting patient outcomes using survey data. However, surveys often include numerous features that lead to high-dimensional inputs for machine learning models. This paper exploits a unique source of information in surveys for feature selection. We observe that feature names (i.e., survey questions) are often semantically indicative of what features are most useful. Using language models, we leverage semantic textual similarity (STS) scores between features and targets to select features. The performance of STS scores in directly ranking features as well as in the minimal-redundancy-maximal-relevance (mRMR) algorithm is evaluated using survey data collected as part of a clinical study on persistent post-surgical pain (PPSP) as well as an accessible dataset collected through the NIH All of Us program. Our findings show that features selected with STS can result in higher performance models compared to traditional feature selection algorithms.</abstract>
<identifier type="citekey">warner-etal-2025-utilizing</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.27</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.27/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>502</start>
<end>520</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Utilizing Semantic Textual Similarity for Clinical Survey Data Feature Selection
%A Warner, Benjamin C.
%A Xu, Ziqi
%A Haroutounian, Simon
%A Kannampallil, Thomas
%A Lu, Chenyang
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F warner-etal-2025-utilizing
%X Surveys are widely used to collect patient data in healthcare, and there is significant clinical interest in predicting patient outcomes using survey data. However, surveys often include numerous features that lead to high-dimensional inputs for machine learning models. This paper exploits a unique source of information in surveys for feature selection. We observe that feature names (i.e., survey questions) are often semantically indicative of what features are most useful. Using language models, we leverage semantic textual similarity (STS) scores between features and targets to select features. The performance of STS scores in directly ranking features as well as in the minimal-redundancy-maximal-relevance (mRMR) algorithm is evaluated using survey data collected as part of a clinical study on persistent post-surgical pain (PPSP) as well as an accessible dataset collected through the NIH All of Us program. Our findings show that features selected with STS can result in higher performance models compared to traditional feature selection algorithms.
%R 10.18653/v1/2025.findings-acl.27
%U https://aclanthology.org/2025.findings-acl.27/
%U https://doi.org/10.18653/v1/2025.findings-acl.27
%P 502-520
Markdown (Informal)
[Utilizing Semantic Textual Similarity for Clinical Survey Data Feature Selection](https://aclanthology.org/2025.findings-acl.27/) (Warner et al., Findings 2025)
ACL