@inproceedings{hayat-hasan-2025-context,
title = "A Context-Aware Approach for Enhancing Data Imputation with Pre-trained Language Models",
author = "Hayat, Ahatsham and
Hasan, Mohammad R.",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.380/",
pages = "5668--5685",
abstract = "This paper presents a novel approach named Contextually Relevant Imputation leveraging pre-trained Language Models (CRILM) for handling missing data in tabular datasets. Instead of relying on traditional numerical estimations, CRILM uses pre-trained language models (LMs) to create contextually relevant descriptors for missing values. This method aligns datasets with LMs' strengths, allowing large LMs to generate these descriptors and small LMs to be fine-tuned on the enriched datasets for enhanced downstream task performance. Our evaluations demonstrate CRILM`s superior performance and robustness across MCAR, MAR, and challenging MNAR scenarios, with up to a 10{\%} improvement over the best-performing baselines. By mitigating biases, particularly in MNAR settings, CRILM improves downstream task performance and offers a cost-effective solution for resource-constrained environments."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hayat-hasan-2025-context">
<titleInfo>
<title>A Context-Aware Approach for Enhancing Data Imputation with Pre-trained Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ahatsham</namePart>
<namePart type="family">Hayat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Hasan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents a novel approach named Contextually Relevant Imputation leveraging pre-trained Language Models (CRILM) for handling missing data in tabular datasets. Instead of relying on traditional numerical estimations, CRILM uses pre-trained language models (LMs) to create contextually relevant descriptors for missing values. This method aligns datasets with LMs’ strengths, allowing large LMs to generate these descriptors and small LMs to be fine-tuned on the enriched datasets for enhanced downstream task performance. Our evaluations demonstrate CRILM‘s superior performance and robustness across MCAR, MAR, and challenging MNAR scenarios, with up to a 10% improvement over the best-performing baselines. By mitigating biases, particularly in MNAR settings, CRILM improves downstream task performance and offers a cost-effective solution for resource-constrained environments.</abstract>
<identifier type="citekey">hayat-hasan-2025-context</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.380/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>5668</start>
<end>5685</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Context-Aware Approach for Enhancing Data Imputation with Pre-trained Language Models
%A Hayat, Ahatsham
%A Hasan, Mohammad R.
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F hayat-hasan-2025-context
%X This paper presents a novel approach named Contextually Relevant Imputation leveraging pre-trained Language Models (CRILM) for handling missing data in tabular datasets. Instead of relying on traditional numerical estimations, CRILM uses pre-trained language models (LMs) to create contextually relevant descriptors for missing values. This method aligns datasets with LMs’ strengths, allowing large LMs to generate these descriptors and small LMs to be fine-tuned on the enriched datasets for enhanced downstream task performance. Our evaluations demonstrate CRILM‘s superior performance and robustness across MCAR, MAR, and challenging MNAR scenarios, with up to a 10% improvement over the best-performing baselines. By mitigating biases, particularly in MNAR settings, CRILM improves downstream task performance and offers a cost-effective solution for resource-constrained environments.
%U https://aclanthology.org/2025.coling-main.380/
%P 5668-5685
Markdown (Informal)
[A Context-Aware Approach for Enhancing Data Imputation with Pre-trained Language Models](https://aclanthology.org/2025.coling-main.380/) (Hayat & Hasan, COLING 2025)
ACL