@inproceedings{jararweh-etal-2025-protein2text,
title = "{P}rotein2{T}ext: Resampling Mechanism to Translate Protein Sequences into Human-Interpretable Text",
author = "Jararweh, Ala and
Macaulay, Oladimeji and
Arredondo, David and
Hu, Yue and
Tafoya, Luis E and
Virupakshappa, Kushal and
Sahu, Avinash",
editor = "Chen, Weizhu and
Yang, Yi and
Kachuee, Mohammad and
Fu, Xue-Yong",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-industry.68/",
doi = "10.18653/v1/2025.naacl-industry.68",
pages = "918--937",
ISBN = "979-8-89176-194-0",
abstract = "Proteins play critical roles in biological systems, yet 99.7{\%} of over 227 million known protein sequences remain uncharacterized due to the limitations of experimental methods. To assist experimentalists in narrowing down hypotheses and accelerating protein characterization, we present Protein2Text, a multimodal large language model that interprets protein sequences and generates informative text to address open-ended questions about protein functions and attributes. By integrating a resampling mechanism within an adapted LLaVA framework, our model effectively maps protein sequences into a language-compatible space, enhancing its capability to handle diverse and complex queries. Trained on a newly curated dataset derived from PubMed articles and rigorously evaluated using four comprehensive benchmarks{---}including in-domain and cross-domain evaluations{---}Protein2Text outperforms several existing models in open-ended question-answering tasks. Our work also highlights the limitations of current evaluation metrics applied to template-based approaches, which may lead to misleading results, emphasizing the need for unbiased assessment methods. Our model weights, evaluation datasets, and evaluation scripts are publicly available at https://github.com/alaaj27/Protein2Text.git."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jararweh-etal-2025-protein2text">
<titleInfo>
<title>Protein2Text: Resampling Mechanism to Translate Protein Sequences into Human-Interpretable Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ala</namePart>
<namePart type="family">Jararweh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oladimeji</namePart>
<namePart type="family">Macaulay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Arredondo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="given">E</namePart>
<namePart type="family">Tafoya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kushal</namePart>
<namePart type="family">Virupakshappa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avinash</namePart>
<namePart type="family">Sahu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Weizhu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="family">Kachuee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xue-Yong</namePart>
<namePart type="family">Fu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-194-0</identifier>
</relatedItem>
<abstract>Proteins play critical roles in biological systems, yet 99.7% of over 227 million known protein sequences remain uncharacterized due to the limitations of experimental methods. To assist experimentalists in narrowing down hypotheses and accelerating protein characterization, we present Protein2Text, a multimodal large language model that interprets protein sequences and generates informative text to address open-ended questions about protein functions and attributes. By integrating a resampling mechanism within an adapted LLaVA framework, our model effectively maps protein sequences into a language-compatible space, enhancing its capability to handle diverse and complex queries. Trained on a newly curated dataset derived from PubMed articles and rigorously evaluated using four comprehensive benchmarks—including in-domain and cross-domain evaluations—Protein2Text outperforms several existing models in open-ended question-answering tasks. Our work also highlights the limitations of current evaluation metrics applied to template-based approaches, which may lead to misleading results, emphasizing the need for unbiased assessment methods. Our model weights, evaluation datasets, and evaluation scripts are publicly available at https://github.com/alaaj27/Protein2Text.git.</abstract>
<identifier type="citekey">jararweh-etal-2025-protein2text</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-industry.68</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-industry.68/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>918</start>
<end>937</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Protein2Text: Resampling Mechanism to Translate Protein Sequences into Human-Interpretable Text
%A Jararweh, Ala
%A Macaulay, Oladimeji
%A Arredondo, David
%A Hu, Yue
%A Tafoya, Luis E.
%A Virupakshappa, Kushal
%A Sahu, Avinash
%Y Chen, Weizhu
%Y Yang, Yi
%Y Kachuee, Mohammad
%Y Fu, Xue-Yong
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-194-0
%F jararweh-etal-2025-protein2text
%X Proteins play critical roles in biological systems, yet 99.7% of over 227 million known protein sequences remain uncharacterized due to the limitations of experimental methods. To assist experimentalists in narrowing down hypotheses and accelerating protein characterization, we present Protein2Text, a multimodal large language model that interprets protein sequences and generates informative text to address open-ended questions about protein functions and attributes. By integrating a resampling mechanism within an adapted LLaVA framework, our model effectively maps protein sequences into a language-compatible space, enhancing its capability to handle diverse and complex queries. Trained on a newly curated dataset derived from PubMed articles and rigorously evaluated using four comprehensive benchmarks—including in-domain and cross-domain evaluations—Protein2Text outperforms several existing models in open-ended question-answering tasks. Our work also highlights the limitations of current evaluation metrics applied to template-based approaches, which may lead to misleading results, emphasizing the need for unbiased assessment methods. Our model weights, evaluation datasets, and evaluation scripts are publicly available at https://github.com/alaaj27/Protein2Text.git.
%R 10.18653/v1/2025.naacl-industry.68
%U https://aclanthology.org/2025.naacl-industry.68/
%U https://doi.org/10.18653/v1/2025.naacl-industry.68
%P 918-937
Markdown (Informal)
[Protein2Text: Resampling Mechanism to Translate Protein Sequences into Human-Interpretable Text](https://aclanthology.org/2025.naacl-industry.68/) (Jararweh et al., NAACL 2025)
ACL
- Ala Jararweh, Oladimeji Macaulay, David Arredondo, Yue Hu, Luis E Tafoya, Kushal Virupakshappa, and Avinash Sahu. 2025. Protein2Text: Resampling Mechanism to Translate Protein Sequences into Human-Interpretable Text. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track), pages 918–937, Albuquerque, New Mexico. Association for Computational Linguistics.