% BibTeX record for the Findings of ACL 2026 paper "OPINE" (Zhang et al.).
% Acronyms and proper nouns in title/booktitle are brace-protected as whole
% words so that styles which sentence-case titles keep their capitalisation.
% The abstract is stored verbatim, including its *...* emphasis markers.
@inproceedings{zhang-etal-2026-opine,
  title     = {{OPINE}: A Prior-calibrated Scoring Framework for {LLM}-based Multi-label Scientific Opinion Classification},
  author    = {Zhang, Mengting and
               Pan, Gaofeng and
               Zhang, Zhixiong and
               Li, Yang and
               Zhang, Guangyin},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1617/},
  pages     = {32313--32333},
  isbn      = {979-8-89176-395-1},
  abstract  = {Scientific opinion classification based on discourse functions provides a structured semantic basis for analytical tasks such as gap identification and hypothesis generation. However, this task is uniquely challenged by the multi-label nature of scientific expressions and AIMRaD structural constraints. Existing LLM-based methods typically rely on direct label generation, which obscures decision logic, or treat discourse information as passive context rather than a structural prior. We propose OPINE, a multi-stage framework that reformulates classification as a controllable *scoring-calibration-refinement* pipeline. By decoupling textual evidence from decision logic, OPINE generates independent label-wise affinity scores calibrated by AIMRaD priors. To resolve the multi-label challenge, we introduce a quantile-based decoding rule to naturally capture co-existing roles, alongside a pairwise refinement mechanism to mitigate confusion between similar categories. We contribute a new benchmark of 18 discourse functions across diverse sections. Experimental results show that OPINE generally outperforms strong baselines, reaching F1 scores of 63.20{\%}, 53.68{\%}, and 63.22{\%} under Micro, Macro, and Example settings, respectively. Our analysis reveals that integrating discourse structures as explicit priors is superior to conventional passive context integration, while pairwise refinement successfully mitigates confusion between functionally similar categories. The code and dataset are available at https://github.com/znoodle63/OPINE.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper identified by citekey zhang-etal-2026-opine. -->
  <mods ID="zhang-etal-2026-opine">
    <titleInfo>
      <title>OPINE: A Prior-calibrated Scoring Framework for LLM-based Multi-label Scientific Opinion Classification</title>
    </titleInfo>

    <!-- Paper authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Mengting</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gaofeng</namePart>
      <namePart type="family">Pan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhixiong</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yang</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Guangyin</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>

    <abstract>Scientific opinion classification based on discourse functions provides a structured semantic basis for analytical tasks such as gap identification and hypothesis generation. However, this task is uniquely challenged by the multi-label nature of scientific expressions and AIMRaD structural constraints. Existing LLM-based methods typically rely on direct label generation, which obscures decision logic, or treat discourse information as passive context rather than a structural prior. We propose OPINE, a multi-stage framework that reformulates classification as a controllable *scoring-calibration-refinement* pipeline. By decoupling textual evidence from decision logic, OPINE generates independent label-wise affinity scores calibrated by AIMRaD priors. To resolve the multi-label challenge, we introduce a quantile-based decoding rule to naturally capture co-existing roles, alongside a pairwise refinement mechanism to mitigate confusion between similar categories. We contribute a new benchmark of 18 discourse functions across diverse sections. Experimental results show that OPINE generally outperforms strong baselines, reaching F1 scores of 63.20%, 53.68%, and 63.22% under Micro, Macro, and Example settings, respectively. Our analysis reveals that integrating discourse structures as explicit priors is superior to conventional passive context integration, while pairwise refinement successfully mitigates confusion between functionally similar categories. The code and dataset are available at https://github.com/znoodle63/OPINE.</abstract>
    <identifier type="citekey">zhang-etal-2026-opine</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1617/</url>
    </location>

    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>32313</start>
        <end>32333</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T OPINE: A Prior-calibrated Scoring Framework for LLM-based Multi-label Scientific Opinion Classification
%A Zhang, Mengting
%A Pan, Gaofeng
%A Zhang, Zhixiong
%A Li, Yang
%A Zhang, Guangyin
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zhang-etal-2026-opine
%X Scientific opinion classification based on discourse functions provides a structured semantic basis for analytical tasks such as gap identification and hypothesis generation. However, this task is uniquely challenged by the multi-label nature of scientific expressions and AIMRaD structural constraints. Existing LLM-based methods typically rely on direct label generation, which obscures decision logic, or treat discourse information as passive context rather than a structural prior. We propose OPINE, a multi-stage framework that reformulates classification as a controllable *scoring-calibration-refinement* pipeline. By decoupling textual evidence from decision logic, OPINE generates independent label-wise affinity scores calibrated by AIMRaD priors. To resolve the multi-label challenge, we introduce a quantile-based decoding rule to naturally capture co-existing roles, alongside a pairwise refinement mechanism to mitigate confusion between similar categories. We contribute a new benchmark of 18 discourse functions across diverse sections. Experimental results show that OPINE generally outperforms strong baselines, reaching F1 scores of 63.20%, 53.68%, and 63.22% under Micro, Macro, and Example settings, respectively. Our analysis reveals that integrating discourse structures as explicit priors is superior to conventional passive context integration, while pairwise refinement successfully mitigates confusion between functionally similar categories. The code and dataset are available at https://github.com/znoodle63/OPINE.
%U https://aclanthology.org/2026.findings-acl.1617/
%P 32313-32333
Markdown (Informal)
[OPINE: A Prior-calibrated Scoring Framework for LLM-based Multi-label Scientific Opinion Classification](https://aclanthology.org/2026.findings-acl.1617/) (Zhang et al., Findings 2026)
ACL