@inproceedings{chang-etal-2025-refining,
title = "Refining Dimensions for Improving Clustering-based Cross-lingual Topic Models",
author = "Chang, Chia-Hsuan and
Huang, Tien Yuan and
Tsai, Yi-Hang and
Chang, Chia-Ming and
Hwang, San-Yih",
editor = "Sharoff, Serge and
Terryn, Ayla Rigouts and
Zweigenbaum, Pierre and
Rapp, Reinhard",
booktitle = "Proceedings of the 18th Workshop on Building and Using Comparable Corpora (BUCC)",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.bucc-1.6/",
pages = "46--56",
abstract = "Recent works in clustering-based topic models perform well in monolingual topic identification by introducing a pipeline to cluster the contextualized representations. However, the pipeline is suboptimal in identifying topics across languages due to the presence of language-dependent dimensions (LDDs) generated by multilingual language models. To address this issue, we introduce a novel, SVD-based dimension refinement component into the pipeline of the clustering-based topic model. This component effectively neutralizes the negative impact of LDDs, enabling the model to accurately identify topics across languages. Our experiments on three datasets demonstrate that the updated pipeline with the dimension refinement component generally outperforms other state-of-the-art cross-lingual topic models."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chang-etal-2025-refining">
<titleInfo>
<title>Refining Dimensions for Improving Clustering-based Cross-lingual Topic Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chia-Hsuan</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tien</namePart>
<namePart type="given">Yuan</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi-Hang</namePart>
<namePart type="family">Tsai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chia-Ming</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">San-Yih</namePart>
<namePart type="family">Hwang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 18th Workshop on Building and Using Comparable Corpora (BUCC)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Serge</namePart>
<namePart type="family">Sharoff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayla</namePart>
<namePart type="given">Rigouts</namePart>
<namePart type="family">Terryn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Zweigenbaum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reinhard</namePart>
<namePart type="family">Rapp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent works in clustering-based topic models perform well in monolingual topic identification by introducing a pipeline to cluster the contextualized representations. However, the pipeline is suboptimal in identifying topics across languages due to the presence of language-dependent dimensions (LDDs) generated by multilingual language models. To address this issue, we introduce a novel, SVD-based dimension refinement component into the pipeline of the clustering-based topic model. This component effectively neutralizes the negative impact of LDDs, enabling the model to accurately identify topics across languages. Our experiments on three datasets demonstrate that the updated pipeline with the dimension refinement component generally outperforms other state-of-the-art cross-lingual topic models.</abstract>
<identifier type="citekey">chang-etal-2025-refining</identifier>
<location>
<url>https://aclanthology.org/2025.bucc-1.6/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>46</start>
<end>56</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Refining Dimensions for Improving Clustering-based Cross-lingual Topic Models
%A Chang, Chia-Hsuan
%A Huang, Tien Yuan
%A Tsai, Yi-Hang
%A Chang, Chia-Ming
%A Hwang, San-Yih
%Y Sharoff, Serge
%Y Terryn, Ayla Rigouts
%Y Zweigenbaum, Pierre
%Y Rapp, Reinhard
%S Proceedings of the 18th Workshop on Building and Using Comparable Corpora (BUCC)
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F chang-etal-2025-refining
%X Recent works in clustering-based topic models perform well in monolingual topic identification by introducing a pipeline to cluster the contextualized representations. However, the pipeline is suboptimal in identifying topics across languages due to the presence of language-dependent dimensions (LDDs) generated by multilingual language models. To address this issue, we introduce a novel, SVD-based dimension refinement component into the pipeline of the clustering-based topic model. This component effectively neutralizes the negative impact of LDDs, enabling the model to accurately identify topics across languages. Our experiments on three datasets demonstrate that the updated pipeline with the dimension refinement component generally outperforms other state-of-the-art cross-lingual topic models.
%U https://aclanthology.org/2025.bucc-1.6/
%P 46-56
Markdown (Informal)
[Refining Dimensions for Improving Clustering-based Cross-lingual Topic Models](https://aclanthology.org/2025.bucc-1.6/) (Chang et al., BUCC 2025)
ACL