BibTeX
@inproceedings{zhang-etal-2025-tomato,
title = "Tomato, Tomahto, Tomate: Do Multilingual Language Models Understand Based on Subword-Level Semantic Concepts?",
author = "Zhang, Crystina and
Lu, Jing and
Tran, Vinh Q. and
Schuster, Tal and
Metzler, Donald and
Lin, Jimmy",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.98/",
doi = "10.18653/v1/2025.findings-naacl.98",
pages = "1821--1837",
ISBN = "979-8-89176-195-7",
abstract = "Human understanding of text depends on general semantic concepts of words rather than their superficial forms. To what extent does our human intuition transfer to language models? In this work, we study the degree to which current multilingual language models (mLMs) understand based on subword-level semantic concepts. To this end, we form ``semantic tokens'' by merging the semantically similar subwords and their embeddings, and evaluate the updated mLMs on five heterogeneous multilingual downstream tasks. Results show that the general shared semantics could get the models a long way in making the predictions on mLMs with different tokenizers and model sizes. Inspections of the grouped subwords show that they exhibit a wide range of semantic similarities, including synonyms and translations across many languages and scripts. Lastly, we find that the zero-shot results with semantic tokens are on par with or even better than the original models on certain classification tasks, suggesting that the shared subword-level semantics may serve as the anchors for cross-lingual transfer."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="zhang-etal-2025-tomato">
    <titleInfo>
      <title>Tomato, Tomahto, Tomate: Do Multilingual Language Models Understand Based on Subword-Level Semantic Concepts?</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Crystina</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jing</namePart>
      <namePart type="family">Lu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vinh</namePart>
      <namePart type="given">Q</namePart>
      <namePart type="family">Tran</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tal</namePart>
      <namePart type="family">Schuster</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Donald</namePart>
      <namePart type="family">Metzler</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jimmy</namePart>
      <namePart type="family">Lin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-195-7</identifier>
    </relatedItem>
    <abstract>Human understanding of text depends on general semantic concepts of words rather than their superficial forms. To what extent does our human intuition transfer to language models? In this work, we study the degree to which current multilingual language models (mLMs) understand based on subword-level semantic concepts. To this end, we form “semantic tokens” by merging the semantically similar subwords and their embeddings, and evaluate the updated mLMs on five heterogeneous multilingual downstream tasks. Results show that the general shared semantics could get the models a long way in making the predictions on mLMs with different tokenizers and model sizes. Inspections of the grouped subwords show that they exhibit a wide range of semantic similarities, including synonyms and translations across many languages and scripts. Lastly, we find that the zero-shot results with semantic tokens are on par with or even better than the original models on certain classification tasks, suggesting that the shared subword-level semantics may serve as the anchors for cross-lingual transfer.</abstract>
    <identifier type="citekey">zhang-etal-2025-tomato</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-naacl.98</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-naacl.98/</url>
    </location>
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>1821</start>
        <end>1837</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Tomato, Tomahto, Tomate: Do Multilingual Language Models Understand Based on Subword-Level Semantic Concepts?
%A Zhang, Crystina
%A Lu, Jing
%A Tran, Vinh Q.
%A Schuster, Tal
%A Metzler, Donald
%A Lin, Jimmy
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F zhang-etal-2025-tomato
%X Human understanding of text depends on general semantic concepts of words rather than their superficial forms. To what extent does our human intuition transfer to language models? In this work, we study the degree to which current multilingual language models (mLMs) understand based on subword-level semantic concepts. To this end, we form “semantic tokens” by merging the semantically similar subwords and their embeddings, and evaluate the updated mLMs on five heterogeneous multilingual downstream tasks. Results show that the general shared semantics could get the models a long way in making the predictions on mLMs with different tokenizers and model sizes. Inspections of the grouped subwords show that they exhibit a wide range of semantic similarities, including synonyms and translations across many languages and scripts. Lastly, we find that the zero-shot results with semantic tokens are on par with or even better than the original models on certain classification tasks, suggesting that the shared subword-level semantics may serve as the anchors for cross-lingual transfer.
%R 10.18653/v1/2025.findings-naacl.98
%U https://aclanthology.org/2025.findings-naacl.98/
%U https://doi.org/10.18653/v1/2025.findings-naacl.98
%P 1821-1837
Markdown (Informal)
[Tomato, Tomahto, Tomate: Do Multilingual Language Models Understand Based on Subword-Level Semantic Concepts?](https://aclanthology.org/2025.findings-naacl.98/) (Zhang et al., Findings 2025)
ACL
- Crystina Zhang, Jing Lu, Vinh Q. Tran, Tal Schuster, Donald Metzler, and Jimmy Lin. 2025. Tomato, Tomahto, Tomate: Do Multilingual Language Models Understand Based on Subword-Level Semantic Concepts? In Findings of the Association for Computational Linguistics: NAACL 2025, pages 1821–1837, Albuquerque, New Mexico. Association for Computational Linguistics.