% BibTeX record for ACL Anthology paper 2026.findings-acl.1551.
% The address field holds the location given by the source record; it is kept as exported.
@inproceedings{kahaer-etal-2026-synergizing,
  title     = {Synergizing Semantic Anchors and Ordinal Smoothed Cross-Entropy for Speech Fluency Classification},
  author    = {Kahaer, Mulati and
               Ruzmamat, Sirajahmat and
               Pang, XuDong and
               Maimaitituerxun, Subinuer and
               Kadeer, Zaokere and
               Reheman, Abudurexiti and
               Lu, Wenwen and
               Zheng, Panpan and
               Wumaier, Aishan},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1551/},
  doi       = {10.18653/v1/2026.findings-acl.1551},
  pages     = {31018--31029},
  isbn      = {979-8-89176-395-1},
  abstract  = {Speech fluency is a core indicator of second language proficiency and a critical component of Computer-Assisted Pronunciation Training (CAPT) systems. Accurate assessment requires models to perceive both macroscopic speech flow trends and microscopic local anomalies. However, existing methods struggle to bridge the semantic gap between static expert priors and dynamic temporal representations, while often overlooking the inherent ordinal nature of fluency scores. To address these challenges, we first construct a set of expert features targeting fluency disruptions and rhythmic regularity to provide explicit linguistic priors. Building on this, we propose the Multimodal Multi-Stream Fusion Classification (MMSFC) network. It employs a Mutual Cross-Attention (MCA) mechanism that leverages these expert features as ``semantic anchors'' to actively guide Whisper{'}s temporal representations and integrate decoder contexts, achieving deep interaction between global priors and local dynamics. Furthermore, we propose the Ordinal Smoothed Cross-Entropy (OSCE) loss. By constructing distance-aware soft target distributions coupled with confidence-adaptive smoothing and boundary enhancement, OSCE explicitly models ordinal relationships to resolve boundary ambiguity. Experiments on SpeechOcean762 show MMSFC achieves 83.40{\%} accuracy, significantly outperforming strong baselines. Notably, OSCE also demonstrates superior generalization potential in cross-domain CV and NLP tasks. Our code is available at \url{https://github.com/speech26ai/MMSFCCode}.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record; the mods ID matches the citekey identifier below. -->
  <mods ID="kahaer-etal-2026-synergizing">
    <titleInfo>
      <title>Synergizing Semantic Anchors and Ordinal Smoothed Cross-Entropy for Speech Fluency Classification</title>
    </titleInfo>

    <!-- Authors (one <name> element per person, role "author"). -->
    <name type="personal">
      <namePart type="given">Mulati</namePart>
      <namePart type="family">Kahaer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sirajahmat</namePart>
      <namePart type="family">Ruzmamat</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">XuDong</namePart>
      <namePart type="family">Pang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Subinuer</namePart>
      <namePart type="family">Maimaitituerxun</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zaokere</namePart>
      <namePart type="family">Kadeer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Abudurexiti</namePart>
      <namePart type="family">Reheman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wenwen</namePart>
      <namePart type="family">Lu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Panpan</namePart>
      <namePart type="family">Zheng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aishan</namePart>
      <namePart type="family">Wumaier</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>

    <abstract>Speech fluency is a core indicator of second language proficiency and a critical component of Computer-Assisted Pronunciation Training (CAPT) systems. Accurate assessment requires models to perceive both macroscopic speech flow trends and microscopic local anomalies. However, existing methods struggle to bridge the semantic gap between static expert priors and dynamic temporal representations, while often overlooking the inherent ordinal nature of fluency scores. To address these challenges, we first construct a set of expert features targeting fluency disruptions and rhythmic regularity to provide explicit linguistic priors. Building on this, we propose the Multimodal Multi-Stream Fusion Classification (MMSFC) network. It employs a Mutual Cross-Attention (MCA) mechanism that leverages these expert features as “semantic anchors” to actively guide Whisper’s temporal representations and integrate decoder contexts, achieving deep interaction between global priors and local dynamics. Furthermore, we propose the Ordinal Smoothed Cross-Entropy (OSCE) loss. By constructing distance-aware soft target distributions coupled with confidence-adaptive smoothing and boundary enhancement, OSCE explicitly models ordinal relationships to resolve boundary ambiguity. Experiments on SpeechOcean762 show MMSFC achieves 83.40% accuracy, significantly outperforming strong baselines. Notably, OSCE also demonstrates superior generalization potential in cross-domain CV and NLP tasks. Our code is available at https://github.com/speech26ai/MMSFCCode.</abstract>

    <!-- Identifiers and landing page. -->
    <identifier type="citekey">kahaer-etal-2026-synergizing</identifier>
    <identifier type="doi">10.18653/v1/2026.findings-acl.1551</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1551/</url>
    </location>

    <!-- Page range within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>31018</start>
        <end>31029</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Synergizing Semantic Anchors and Ordinal Smoothed Cross-Entropy for Speech Fluency Classification
%A Kahaer, Mulati
%A Ruzmamat, Sirajahmat
%A Pang, XuDong
%A Maimaitituerxun, Subinuer
%A Kadeer, Zaokere
%A Reheman, Abudurexiti
%A Lu, Wenwen
%A Zheng, Panpan
%A Wumaier, Aishan
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F kahaer-etal-2026-synergizing
%X Speech fluency is a core indicator of second language proficiency and a critical component of Computer-Assisted Pronunciation Training (CAPT) systems. Accurate assessment requires models to perceive both macroscopic speech flow trends and microscopic local anomalies. However, existing methods struggle to bridge the semantic gap between static expert priors and dynamic temporal representations, while often overlooking the inherent ordinal nature of fluency scores. To address these challenges, we first construct a set of expert features targeting fluency disruptions and rhythmic regularity to provide explicit linguistic priors. Building on this, we propose the Multimodal Multi-Stream Fusion Classification (MMSFC) network. It employs a Mutual Cross-Attention (MCA) mechanism that leverages these expert features as “semantic anchors” to actively guide Whisper’s temporal representations and integrate decoder contexts, achieving deep interaction between global priors and local dynamics. Furthermore, we propose the Ordinal Smoothed Cross-Entropy (OSCE) loss. By constructing distance-aware soft target distributions coupled with confidence-adaptive smoothing and boundary enhancement, OSCE explicitly models ordinal relationships to resolve boundary ambiguity. Experiments on SpeechOcean762 show MMSFC achieves 83.40% accuracy, significantly outperforming strong baselines. Notably, OSCE also demonstrates superior generalization potential in cross-domain CV and NLP tasks. Our code is available at https://github.com/speech26ai/MMSFCCode.
%R 10.18653/v1/2026.findings-acl.1551
%U https://aclanthology.org/2026.findings-acl.1551/
%U https://doi.org/10.18653/v1/2026.findings-acl.1551
%P 31018-31029
Markdown (Informal)
[Synergizing Semantic Anchors and Ordinal Smoothed Cross-Entropy for Speech Fluency Classification](https://aclanthology.org/2026.findings-acl.1551/) (Kahaer et al., Findings 2026)
ACL
- Mulati Kahaer, Sirajahmat Ruzmamat, XuDong Pang, Subinuer Maimaitituerxun, Zaokere Kadeer, Abudurexiti Reheman, Wenwen Lu, Panpan Zheng, and Aishan Wumaier. 2026. Synergizing Semantic Anchors and Ordinal Smoothed Cross-Entropy for Speech Fluency Classification. In Findings of the Association for Computational Linguistics: ACL 2026, pages 31018–31029, San Diego, California, United States. Association for Computational Linguistics.