% BibTeX record for a paper in the Findings of ACL 2026 volume (ACL Anthology URL below).
% Text outside an entry is ignored by BibTeX, so these two lines act as comments.
@inproceedings{zhong-etal-2026-multimodal,
  title     = {Multimodal Chemical Structure-Text Coreference in Intellectual Property via Rule-guided Reinforcement Learning},
  author    = {Zhong, Hanmeng and
               Wu, Wentao and
               Chen, Linqing and
               Zhou, Peng},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1489/},
  pages     = {29784--29796},
  isbn      = {979-8-89176-395-1},
  abstract  = {Navigating biopharmaceutical intellectual property necessitates precisely associating visual chemical structures with their textual referents across lengthy documents. Despite its critical role in drug discovery, this multimodal coreference task remains underexplored. It presents unique challenges, including handling Markush structures and distinguishing the atom-level differences between adjacent structures. To bridge this gap, we define the multimodal Chemical Structure-Text coreference and introduce CheST, the first dataset explicitly designed for the task. Furthermore, to satisfy the strict logical consistency in the task, we propose RULER, a RULE-guided multimodal Reinforcement learning framework built upon an SFT cold start. RULER utilizes rule-driven reward functions operationalizing multidimensional consistencies, acting as a domain-specific ``verifier'' to obtain the correct domain knowledge. Experimental results demonstrate that RULER achieves a 40{\%} improvement over the strongest baseline{--}Gemini-2.5-Pro, demonstrating the superior efficacy.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID equals the citekey identifier given further down. -->
  <mods ID="zhong-etal-2026-multimodal">
    <titleInfo>
      <title>Multimodal Chemical Structure-Text Coreference in Intellectual Property via Rule-guided Reinforcement Learning</title>
    </titleInfo>

    <!-- Authors of the paper (role: author). -->
    <name type="personal">
      <namePart type="given">Hanmeng</namePart>
      <namePart type="family">Zhong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wentao</namePart>
      <namePart type="family">Wu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Linqing</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Peng</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place, genre and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>

    <abstract>Navigating biopharmaceutical intellectual property necessitates precisely associating visual chemical structures with their textual referents across lengthy documents. Despite its critical role in drug discovery, this multimodal coreference task remains underexplored. It presents unique challenges, including handling Markush structures and distinguishing the atom-level differences between adjacent structures. To bridge this gap, we define the multimodal Chemical Structure-Text coreference and introduce CheST, the first dataset explicitly designed for the task. Furthermore, to satisfy the strict logical consistency in the task, we propose RULER, a RULE-guided multimodal Reinforcement learning framework built upon an SFT cold start. RULER utilizes rule-driven reward functions operationalizing multidimensional consistencies, acting as a domain-specific “verifier” to obtain the correct domain knowledge. Experimental results demonstrate that RULER achieves a 40% improvement over the strongest baseline–Gemini-2.5-Pro, demonstrating the superior efficacy.</abstract>
    <identifier type="citekey">zhong-etal-2026-multimodal</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1489/</url>
    </location>

    <!-- Date and page extent of this paper. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>29784</start>
        <end>29796</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Multimodal Chemical Structure-Text Coreference in Intellectual Property via Rule-guided Reinforcement Learning
%A Zhong, Hanmeng
%A Wu, Wentao
%A Chen, Linqing
%A Zhou, Peng
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zhong-etal-2026-multimodal
%X Navigating biopharmaceutical intellectual property necessitates precisely associating visual chemical structures with their textual referents across lengthy documents. Despite its critical role in drug discovery, this multimodal coreference task remains underexplored. It presents unique challenges, including handling Markush structures and distinguishing the atom-level differences between adjacent structures. To bridge this gap, we define the multimodal Chemical Structure-Text coreference and introduce CheST, the first dataset explicitly designed for the task. Furthermore, to satisfy the strict logical consistency in the task, we propose RULER, a RULE-guided multimodal Reinforcement learning framework built upon an SFT cold start. RULER utilizes rule-driven reward functions operationalizing multidimensional consistencies, acting as a domain-specific “verifier” to obtain the correct domain knowledge. Experimental results demonstrate that RULER achieves a 40% improvement over the strongest baseline–Gemini-2.5-Pro, demonstrating the superior efficacy.
%U https://aclanthology.org/2026.findings-acl.1489/
%P 29784-29796
Markdown (Informal)
[Multimodal Chemical Structure-Text Coreference in Intellectual Property via Rule-guided Reinforcement Learning](https://aclanthology.org/2026.findings-acl.1489/) (Zhong et al., Findings 2026)
ACL