@inproceedings{yamauchi-aizawa-2026-semantics,
title = "From Semantics to Style: A Cross-Dataset Comparative Framework for Sentence Similarity Predictions",
author = "Yamauchi, Yusuke and
Aizawa, Akiko",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.95/",
pages = "1848--1877",
ISBN = "979-8-89176-386-9",
abstract = "While Semantic Textual Similarity (STS) task serves as a cornerstone embedding task in natural language processing, the definition of similarity is inherently ambiguous and dataset-specific. Comprehensive cross-dataset analysis remains scarce, leaving it uncertain whether language models perceive diverse semantic and stylistic nuances as humans do. To address this, we propose a comparative framework utilizing lightweight poolers on a frozen encoder to conduct a unified analysis across STS, Paraphrase Identification (PI), and Triplet datasets. Experimental results on 21 datasets indicate a high correlation of semantic concepts between STS and PI settings, while highlighting style as a distinct dimension necessitating explicit separation from semantics. Moreover, Procrustes, layer-wise and hierarchical clustering analyses elucidate the varying properties of these concepts and the structural organization of the embedding space. These insights imply that treating semantics and style as separate components in embedding models is crucial for enhancing both interpretability and practical utility."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yamauchi-aizawa-2026-semantics">
<titleInfo>
<title>From Semantics to Style: A Cross-Dataset Comparative Framework for Sentence Similarity Predictions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yusuke</namePart>
<namePart type="family">Yamauchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akiko</namePart>
<namePart type="family">Aizawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>While Semantic Textual Similarity (STS) task serves as a cornerstone embedding task in natural language processing, the definition of similarity is inherently ambiguous and dataset-specific. Comprehensive cross-dataset analysis remains scarce, leaving it uncertain whether language models perceive diverse semantic and stylistic nuances as humans do. To address this, we propose a comparative framework utilizing lightweight poolers on a frozen encoder to conduct a unified analysis across STS, Paraphrase Identification (PI), and Triplet datasets. Experimental results on 21 datasets indicate a high correlation of semantic concepts between STS and PI settings, while highlighting style as a distinct dimension necessitating explicit separation from semantics. Moreover, Procrustes, layer-wise and hierarchical clustering analyses elucidate the varying properties of these concepts and the structural organization of the embedding space. These insights imply that treating semantics and style as separate components in embedding models is crucial for enhancing both interpretability and practical utility.</abstract>
<identifier type="citekey">yamauchi-aizawa-2026-semantics</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.95/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>1848</start>
<end>1877</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T From Semantics to Style: A Cross-Dataset Comparative Framework for Sentence Similarity Predictions
%A Yamauchi, Yusuke
%A Aizawa, Akiko
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F yamauchi-aizawa-2026-semantics
%X While Semantic Textual Similarity (STS) task serves as a cornerstone embedding task in natural language processing, the definition of similarity is inherently ambiguous and dataset-specific. Comprehensive cross-dataset analysis remains scarce, leaving it uncertain whether language models perceive diverse semantic and stylistic nuances as humans do. To address this, we propose a comparative framework utilizing lightweight poolers on a frozen encoder to conduct a unified analysis across STS, Paraphrase Identification (PI), and Triplet datasets. Experimental results on 21 datasets indicate a high correlation of semantic concepts between STS and PI settings, while highlighting style as a distinct dimension necessitating explicit separation from semantics. Moreover, Procrustes, layer-wise and hierarchical clustering analyses elucidate the varying properties of these concepts and the structural organization of the embedding space. These insights imply that treating semantics and style as separate components in embedding models is crucial for enhancing both interpretability and practical utility.
%U https://aclanthology.org/2026.findings-eacl.95/
%P 1848-1877
Markdown (Informal)
[From Semantics to Style: A Cross-Dataset Comparative Framework for Sentence Similarity Predictions](https://aclanthology.org/2026.findings-eacl.95/) (Yamauchi & Aizawa, Findings 2026)
ACL