@inproceedings{liu-etal-2025-dasa,
title = "{DASA}-Trans-{STM}: Adaptive Efficient Transformer for Short Text Matching using Data Augmentation and Semantic Awareness",
author = "Liu, Jiguo and
Liu, Chao and
Li, Meimei and
Li, Nan and
Gao, Shihao and
Zhu, Dali",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.228/",
pages = "4594--4610",
ISBN = "979-8-89176-332-6",
abstract = "Rencent advancements in large language models (LLM) have shown impressive versatility across various tasks. Short text matching is one of the fundamental technologies in natural language processing. In previous studies, the common approach to applying them to Chinese is segmenting each sentence into words, and then taking these words as input. However, existing approaches have three limitations: 1) Some Chinese words are polysemous, and semantic information is not fully utilized. 2) Some models suffer potential issues caused by word segmentation and incorrect recognition of negative words affects the semantic understanding of the whole sentence. 3) Fuzzy negation words in ancient Chinese are difficult to recognize and match. In this work, we propose a novel adaptive Transformer for Chinese short text matching using Data Augmentation and Semantic Awareness (DASA), which can fully mine the information expressed in Chinese text to deal with word ambiguity. DASA is based on a Graph Attention Transformer Encoder that takes two word lattice graphs as input and integrates sense information from N-HowNet to moderate word ambiguity. Specially, we use an LLM to generate similar sentences for the optimal text representation. Experimental results show that the augmentation done using DASA can considerably boost the performance of our system and achieve significantly better results than previous state-of-the-art methods on four available datasets, namely MNS, LCQMC, AFQMC, and BQ."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2025-dasa">
<titleInfo>
<title>DASA-Trans-STM: Adaptive Efficient Transformer for Short Text Matching using Data Augmentation and Semantic Awareness</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiguo</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chao</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meimei</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nan</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shihao</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dali</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Recent advancements in large language models (LLMs) have shown impressive versatility across various tasks. Short text matching is one of the fundamental technologies in natural language processing. In previous studies, the common approach to applying such models to Chinese is to segment each sentence into words and then take these words as input. However, existing approaches have three limitations: 1) Some Chinese words are polysemous, and semantic information is not fully utilized. 2) Some models suffer from potential issues caused by word segmentation, and incorrect recognition of negation words affects the semantic understanding of the whole sentence. 3) Fuzzy negation words in ancient Chinese are difficult to recognize and match. In this work, we propose a novel adaptive Transformer for Chinese short text matching using Data Augmentation and Semantic Awareness (DASA), which can fully mine the information expressed in Chinese text to deal with word ambiguity. DASA is based on a Graph Attention Transformer Encoder that takes two word lattice graphs as input and integrates sense information from N-HowNet to moderate word ambiguity. Specifically, we use an LLM to generate similar sentences for the optimal text representation. Experimental results show that the augmentation done using DASA can considerably boost the performance of our system and achieve significantly better results than previous state-of-the-art methods on four available datasets, namely MNS, LCQMC, AFQMC, and BQ.</abstract>
<identifier type="citekey">liu-etal-2025-dasa</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.228/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>4594</start>
<end>4610</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DASA-Trans-STM: Adaptive Efficient Transformer for Short Text Matching using Data Augmentation and Semantic Awareness
%A Liu, Jiguo
%A Liu, Chao
%A Li, Meimei
%A Li, Nan
%A Gao, Shihao
%A Zhu, Dali
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F liu-etal-2025-dasa
%X Recent advancements in large language models (LLMs) have shown impressive versatility across various tasks. Short text matching is one of the fundamental technologies in natural language processing. In previous studies, the common approach to applying such models to Chinese is to segment each sentence into words and then take these words as input. However, existing approaches have three limitations: 1) Some Chinese words are polysemous, and semantic information is not fully utilized. 2) Some models suffer from potential issues caused by word segmentation, and incorrect recognition of negation words affects the semantic understanding of the whole sentence. 3) Fuzzy negation words in ancient Chinese are difficult to recognize and match. In this work, we propose a novel adaptive Transformer for Chinese short text matching using Data Augmentation and Semantic Awareness (DASA), which can fully mine the information expressed in Chinese text to deal with word ambiguity. DASA is based on a Graph Attention Transformer Encoder that takes two word lattice graphs as input and integrates sense information from N-HowNet to moderate word ambiguity. Specifically, we use an LLM to generate similar sentences for the optimal text representation. Experimental results show that the augmentation done using DASA can considerably boost the performance of our system and achieve significantly better results than previous state-of-the-art methods on four available datasets, namely MNS, LCQMC, AFQMC, and BQ.
%U https://aclanthology.org/2025.emnlp-main.228/
%P 4594-4610
Markdown (Informal)
[DASA-Trans-STM: Adaptive Efficient Transformer for Short Text Matching using Data Augmentation and Semantic Awareness](https://aclanthology.org/2025.emnlp-main.228/) (Liu et al., EMNLP 2025)
ACL
Jiguo Liu, Chao Liu, Meimei Li, Nan Li, Shihao Gao, and Dali Zhu. 2025. DASA-Trans-STM: Adaptive Efficient Transformer for Short Text Matching using Data Augmentation and Semantic Awareness. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 4594–4610, Suzhou, China. Association for Computational Linguistics.