@comment{Bibliographic record for the CoreELM paper in the BioNLP 2026 proceedings. Case-sensitive words in title and booktitle are braced as whole words so that case-changing styles leave them intact.}
@inproceedings{ondov-etal-2026-coreelm,
  title     = {{CoreELM}: An Open-Source Framework for Aligning Large Language Models to Embedding Spaces},
  author    = {Ondov, Brian and
               Chang, Chia-Hsuan and
               Zhou, Yujia and
               Giuffr{\`e}, Mauro and
               Xu, Hua},
  editor    = {Demner-Fushman, Dina and
               Ananiadou, Sophia and
               Roberts, Kirk and
               Tsujii, Junichi},
  booktitle = {{BioNLP} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.bionlp-1.15/},
  pages     = {156--180},
  isbn      = {979-8-89176-434-7},
  abstract  = {Text embeddings have become an essential part of a variety of language applications. However, methods for interpreting, exploring and reversing embedding spaces are limited, reducing transparency and precluding potentially valuable generative use cases. In this work, we develop an open-source, domain-agnostic framework for aligning Large Language Models to embedding spaces using the recently reported Embedding Language Model (ELM) method. We demonstrate our framework by training models to recover, summarize, and compare clinical trial abstracts from embeddings alone. In addition to inverting embeddings back to text more reliably than existing methods, our models can decode novel, interpolated embeddings into new clinical trial abstracts that human experts cannot distinguish from real ones. We further show that these generated abstracts are responsive to moving embeddings along concept vectors for age and sex of study subjects. Our public ELM implementation and experimental results will aid the alignment of Large Language Models to embedding spaces in the biomedical domain and beyond.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for citekey ondov-etal-2026-coreelm: five authors, hosted in "BioNLP 2026" (four editors), pages 156-180. -->
  <mods ID="ondov-etal-2026-coreelm">
    <titleInfo>
      <title>CoreELM: An Open-Source Framework for Aligning Large Language Models to Embedding Spaces</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Brian</namePart>
      <namePart type="family">Ondov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chia-Hsuan</namePart>
      <namePart type="family">Chang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yujia</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mauro</namePart>
      <namePart type="family">Giuffrè</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hua</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>BioNLP 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Dina</namePart>
        <namePart type="family">Demner-Fushman</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sophia</namePart>
        <namePart type="family">Ananiadou</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kirk</namePart>
        <namePart type="family">Roberts</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Junichi</namePart>
        <namePart type="family">Tsujii</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-434-7</identifier>
    </relatedItem>
    <abstract>Text embeddings have become an essential part of a variety of language applications. However, methods for interpreting, exploring and reversing embedding spaces are limited, reducing transparency and precluding potentially valuable generative use cases. In this work, we develop an open-source, domain-agnostic framework for aligning Large Language Models to embedding spaces using the recently reported Embedding Language Model (ELM) method. We demonstrate our framework by training models to recover, summarize, and compare clinical trial abstracts from embeddings alone. In addition to inverting embeddings back to text more reliably than existing methods, our models can decode novel, interpolated embeddings into new clinical trial abstracts that human experts cannot distinguish from real ones. We further show that these generated abstracts are responsive to moving embeddings along concept vectors for age and sex of study subjects. Our public ELM implementation and experimental results will aid the alignment of Large Language Models to embedding spaces in the biomedical domain and beyond.</abstract>
    <identifier type="citekey">ondov-etal-2026-coreelm</identifier>
    <location>
      <url>https://aclanthology.org/2026.bionlp-1.15/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>156</start>
        <end>180</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T CoreELM: An Open-Source Framework for Aligning Large Language Models to Embedding Spaces
%A Ondov, Brian
%A Chang, Chia-Hsuan
%A Zhou, Yujia
%A Giuffrè, Mauro
%A Xu, Hua
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Roberts, Kirk
%Y Tsujii, Junichi
%S BioNLP 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-434-7
%F ondov-etal-2026-coreelm
%X Text embeddings have become an essential part of a variety of language applications. However, methods for interpreting, exploring and reversing embedding spaces are limited, reducing transparency and precluding potentially valuable generative use cases. In this work, we develop an open-source, domain-agnostic framework for aligning Large Language Models to embedding spaces using the recently reported Embedding Language Model (ELM) method. We demonstrate our framework by training models to recover, summarize, and compare clinical trial abstracts from embeddings alone. In addition to inverting embeddings back to text more reliably than existing methods, our models can decode novel, interpolated embeddings into new clinical trial abstracts that human experts cannot distinguish from real ones. We further show that these generated abstracts are responsive to moving embeddings along concept vectors for age and sex of study subjects. Our public ELM implementation and experimental results will aid the alignment of Large Language Models to embedding spaces in the biomedical domain and beyond.
%U https://aclanthology.org/2026.bionlp-1.15/
%P 156-180
Markdown (Informal)
[CoreELM: An Open-Source Framework for Aligning Large Language Models to Embedding Spaces](https://aclanthology.org/2026.bionlp-1.15/) (Ondov et al., BioNLP 2026)
ACL