@inproceedings{zhou-etal-2025-lm2protein,
title = "{LM}2{P}rotein: A Structure-to-Token Protein Large Language Model",
author = "Zhou, Chang and
Shan, Yuheng and
Chen, Pengan and
Shi, Xiangyu and
Wang, Zikang and
Li, Yanting and
Jiang, Jiyue",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.369/",
doi = "10.18653/v1/2025.findings-emnlp.369",
pages = "7023--7029",
ISBN = "979-8-89176-335-7",
abstract = "Proteins are critical for various molecular functions, relying on their precise tertiary structures. This structure-sequence relationship is complex and degenerate, meaning multiple sequences can fold into a similar structure. The challenges in protein prediction, design, and modification increase with sequence complexity, while research on RNA-protein interactions, especially RNA-binding proteins (RBPs), is gaining importance. Large-scale pre-trained language models (LLMs) have shown promising results in handling biological sequences by treating them as natural language, though integrating spatial structures remains complex due to the need for specialized visual and 3D modeling approaches. We introduce a method to integrate protein 3D structural data within a sequence processing framework, converting 3D coordinates into discrete structure tokens using a VQ-VAE-like network. This simplifies the handling of 3D data, avoiding complex pipelines and facilitating a unified sequence-to-sequence processing model. Our approach demonstrates strong performance across a range of tasks, achieving high sequence recovery in inverse folding and protein-conditioned RNA design. These outstanding results demonstrate significant potential for application in complex biological systems research."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhou-etal-2025-lm2protein">
<titleInfo>
<title>LM2Protein: A Structure-to-Token Protein Large Language Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chang</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuheng</namePart>
<namePart type="family">Shan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pengan</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiangyu</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zikang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanting</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiyue</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Proteins are critical for various molecular functions, relying on their precise tertiary structures. This structure-sequence relationship is complex and degenerate, meaning multiple sequences can fold into a similar structure. The challenges in protein prediction, design, and modification increase with sequence complexity, while research on RNA-protein interactions, especially RNA-binding proteins (RBPs), is gaining importance. Large-scale pre-trained language models (LLMs) have shown promising results in handling biological sequences by treating them as natural language, though integrating spatial structures remains complex due to the need for specialized visual and 3D modeling approaches. We introduce a method to integrate protein 3D structural data within a sequence processing framework, converting 3D coordinates into discrete structure tokens using a VQ-VAE-like network. This simplifies the handling of 3D data, avoiding complex pipelines and facilitating a unified sequence-to-sequence processing model. Our approach demonstrates strong performance across a range of tasks, achieving high sequence recovery in inverse folding and protein-conditioned RNA design. These outstanding results demonstrate significant potential for application in complex biological systems research.</abstract>
<identifier type="citekey">zhou-etal-2025-lm2protein</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.369</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.369/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>7023</start>
<end>7029</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LM2Protein: A Structure-to-Token Protein Large Language Model
%A Zhou, Chang
%A Shan, Yuheng
%A Chen, Pengan
%A Shi, Xiangyu
%A Wang, Zikang
%A Li, Yanting
%A Jiang, Jiyue
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F zhou-etal-2025-lm2protein
%X Proteins are critical for various molecular functions, relying on their precise tertiary structures. This structure-sequence relationship is complex and degenerate, meaning multiple sequences can fold into a similar structure. The challenges in protein prediction, design, and modification increase with sequence complexity, while research on RNA-protein interactions, especially RNA-binding proteins (RBPs), is gaining importance. Large-scale pre-trained language models (LLMs) have shown promising results in handling biological sequences by treating them as natural language, though integrating spatial structures remains complex due to the need for specialized visual and 3D modeling approaches. We introduce a method to integrate protein 3D structural data within a sequence processing framework, converting 3D coordinates into discrete structure tokens using a VQ-VAE-like network. This simplifies the handling of 3D data, avoiding complex pipelines and facilitating a unified sequence-to-sequence processing model. Our approach demonstrates strong performance across a range of tasks, achieving high sequence recovery in inverse folding and protein-conditioned RNA design. These outstanding results demonstrate significant potential for application in complex biological systems research.
%R 10.18653/v1/2025.findings-emnlp.369
%U https://aclanthology.org/2025.findings-emnlp.369/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.369
%P 7023-7029
Markdown (Informal)
[LM2Protein: A Structure-to-Token Protein Large Language Model](https://aclanthology.org/2025.findings-emnlp.369/) (Zhou et al., Findings 2025)
ACL
- Chang Zhou, Yuheng Shan, Pengan Chen, Xiangyu Shi, Zikang Wang, Yanting Li, and Jiyue Jiang. 2025. LM2Protein: A Structure-to-Token Protein Large Language Model. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 7023–7029, Suzhou, China. Association for Computational Linguistics.