@inproceedings{ha-etal-2025-mv,
title = "{MV}-{CLAM}: Multi-View Molecular Interpretation with Cross-Modal Projection via Language Model",
author = "Ha, Sumin and
Kim, Jun Hyeong and
Piao, Yinhua and
Cho, Changyun and
Kim, Sun",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1174/",
doi = "10.18653/v1/2025.findings-emnlp.1174",
pages = "21528--21549",
ISBN = "979-8-89176-335-7",
abstract = {Deciphering molecular meaning in chemistry and biomedicine depends on context {---} a capability that large language models (LLMs) can enhance by aligning molecular structures with language. However, existing molecule-text models ignore complementary information in different molecular views and rely on single-view representations, limiting molecule structural understanding. Moreover, na{\"i}ve multi-view alignment strategies face two challenges: (1) the aligned spaces differ across views due to inconsistent molecule-text mappings, and (2) existing loss objectives fail to preserve complementary information necessary for fine-grained alignment. To enhance LLM{'}s ability to understand molecular structure, we propose MV-CLAM, a novel framework that aligns multi-view molecular representations into a unified textual space using a multi-querying transformer (MQ-Former). Our approach ensures cross-view consistency while the proposed token-level contrastive loss preserves diverse molecular features across textual queries. MV-CLAM enhances molecular reasoning, improving retrieval and captioning accuracy. The source code of MV-CLAM is available at https://github.com/sumin124/mv-clam.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ha-etal-2025-mv">
<titleInfo>
<title>MV-CLAM: Multi-View Molecular Interpretation with Cross-Modal Projection via Language Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sumin</namePart>
<namePart type="family">Ha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="given">Hyeong</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yinhua</namePart>
<namePart type="family">Piao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Changyun</namePart>
<namePart type="family">Cho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sun</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Deciphering molecular meaning in chemistry and biomedicine depends on context — a capability that large language models (LLMs) can enhance by aligning molecular structures with language. However, existing molecule-text models ignore complementary information in different molecular views and rely on single-view representations, limiting molecule structural understanding. Moreover, naïve multi-view alignment strategies face two challenges: (1) the aligned spaces differ across views due to inconsistent molecule-text mappings, and (2) existing loss objectives fail to preserve complementary information necessary for fine-grained alignment. To enhance LLM’s ability to understand molecular structure, we propose MV-CLAM, a novel framework that aligns multi-view molecular representations into a unified textual space using a multi-querying transformer (MQ-Former). Our approach ensures cross-view consistency while the proposed token-level contrastive loss preserves diverse molecular features across textual queries. MV-CLAM enhances molecular reasoning, improving retrieval and captioning accuracy. The source code of MV-CLAM is available at https://github.com/sumin124/mv-clam.</abstract>
<identifier type="citekey">ha-etal-2025-mv</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.1174</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1174/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>21528</start>
<end>21549</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MV-CLAM: Multi-View Molecular Interpretation with Cross-Modal Projection via Language Model
%A Ha, Sumin
%A Kim, Jun Hyeong
%A Piao, Yinhua
%A Cho, Changyun
%A Kim, Sun
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F ha-etal-2025-mv
%X Deciphering molecular meaning in chemistry and biomedicine depends on context — a capability that large language models (LLMs) can enhance by aligning molecular structures with language. However, existing molecule-text models ignore complementary information in different molecular views and rely on single-view representations, limiting molecule structural understanding. Moreover, naïve multi-view alignment strategies face two challenges: (1) the aligned spaces differ across views due to inconsistent molecule-text mappings, and (2) existing loss objectives fail to preserve complementary information necessary for fine-grained alignment. To enhance LLM’s ability to understand molecular structure, we propose MV-CLAM, a novel framework that aligns multi-view molecular representations into a unified textual space using a multi-querying transformer (MQ-Former). Our approach ensures cross-view consistency while the proposed token-level contrastive loss preserves diverse molecular features across textual queries. MV-CLAM enhances molecular reasoning, improving retrieval and captioning accuracy. The source code of MV-CLAM is available at https://github.com/sumin124/mv-clam.
%R 10.18653/v1/2025.findings-emnlp.1174
%U https://aclanthology.org/2025.findings-emnlp.1174/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.1174
%P 21528-21549
Markdown (Informal)
[MV-CLAM: Multi-View Molecular Interpretation with Cross-Modal Projection via Language Model](https://aclanthology.org/2025.findings-emnlp.1174/) (Ha et al., Findings 2025)
ACL