@inproceedings{feng-etal-2025-word,
title = "Word-level Cross-lingual Structure in Large Language Models",
author = "Feng, Zihao and
Cao, Hailong and
Xu, Wang and
Zhao, Tiejun",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.138/",
pages = "2026--2037",
abstract = "Large Language Models (LLMs) have demonstrated exceptional performance across a broad spectrum of cross-lingual Natural Language Processing (NLP) tasks. However, previous methods predominantly focus on leveraging parallel corpus to conduct instruction data for continuing pre-training or fine-tuning. They ignored the state of parallel data on the hidden layers of LLMs. In this paper, we demonstrate Word-level Cross-lingual Structure (WCS) of LLM which proves that the word-level embedding on the hidden layers are isomorphic between languages. We find that the hidden states of different languages' input on the LLMs hidden layers can be aligned with an orthogonal matrix on word-level. We prove this conclusion in both mathematical and downstream task ways on two representative LLM foundations, LLaMA2 and BLOOM. Besides, we propose an Isomorphism-based Data Augmentation (IDA) method to apply the WCS on a downstream cross-lingual task, Bilingual Lexicon Induction (BLI), in both supervised and unsupervised ways. The experiment shows the significant improvement of our proposed method over all the baselines, especially on low-resource languages."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="feng-etal-2025-word">
<titleInfo>
<title>Word-level Cross-lingual Structure in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zihao</namePart>
<namePart type="family">Feng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hailong</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wang</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tiejun</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large Language Models (LLMs) have demonstrated exceptional performance across a broad spectrum of cross-lingual Natural Language Processing (NLP) tasks. However, previous methods predominantly focus on leveraging parallel corpus to conduct instruction data for continuing pre-training or fine-tuning. They ignored the state of parallel data on the hidden layers of LLMs. In this paper, we demonstrate Word-level Cross-lingual Structure (WCS) of LLM which proves that the word-level embedding on the hidden layers are isomorphic between languages. We find that the hidden states of different languages’ input on the LLMs hidden layers can be aligned with an orthogonal matrix on word-level. We prove this conclusion in both mathematical and downstream task ways on two representative LLM foundations, LLaMA2 and BLOOM. Besides, we propose an Isomorphism-based Data Augmentation (IDA) method to apply the WCS on a downstream cross-lingual task, Bilingual Lexicon Induction (BLI), in both supervised and unsupervised ways. The experiment shows the significant improvement of our proposed method over all the baselines, especially on low-resource languages.</abstract>
<identifier type="citekey">feng-etal-2025-word</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.138/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>2026</start>
<end>2037</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Word-level Cross-lingual Structure in Large Language Models
%A Feng, Zihao
%A Cao, Hailong
%A Xu, Wang
%A Zhao, Tiejun
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F feng-etal-2025-word
%X Large Language Models (LLMs) have demonstrated exceptional performance across a broad spectrum of cross-lingual Natural Language Processing (NLP) tasks. However, previous methods predominantly focus on leveraging parallel corpus to conduct instruction data for continuing pre-training or fine-tuning. They ignored the state of parallel data on the hidden layers of LLMs. In this paper, we demonstrate Word-level Cross-lingual Structure (WCS) of LLM which proves that the word-level embedding on the hidden layers are isomorphic between languages. We find that the hidden states of different languages’ input on the LLMs hidden layers can be aligned with an orthogonal matrix on word-level. We prove this conclusion in both mathematical and downstream task ways on two representative LLM foundations, LLaMA2 and BLOOM. Besides, we propose an Isomorphism-based Data Augmentation (IDA) method to apply the WCS on a downstream cross-lingual task, Bilingual Lexicon Induction (BLI), in both supervised and unsupervised ways. The experiment shows the significant improvement of our proposed method over all the baselines, especially on low-resource languages.
%U https://aclanthology.org/2025.coling-main.138/
%P 2026-2037
Markdown (Informal)
[Word-level Cross-lingual Structure in Large Language Models](https://aclanthology.org/2025.coling-main.138/) (Feng et al., COLING 2025)
ACL