% Conference paper from the IWCS 2025 proceedings (see the url field for the Anthology record).
% NOTE(review): "address" appears to hold the conference location rather than the
% publisher's city; this looks like the exporter's convention -- confirm before changing.
@inproceedings{liu-lareau-2025-disentangling,
  title     = {Disentangling lexical and grammatical information in word embeddings},
  author    = {Liu, Li and
               Lareau, Fran{\c{c}}ois},
  editor    = {Evang, Kilian and
               Kallmeyer, Laura and
               Pogodalla, Sylvain},
  booktitle = {Proceedings of the 16th International Conference on Computational Semantics},
  month     = sep,
  year      = {2025},
  address   = {D{\"u}sseldorf, Germany},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.iwcs-main.28/},
  pages     = {321--330},
  isbn      = {979-8-89176-316-6},
  abstract  = {To enable finer-grained linguistic analysis, we propose a method for the separation of lexical and grammatical information within contextualized word embeddings. Using CamemBERT embeddings for French, we apply our method to 14,472 inflected word forms extracted from the Lexical Network of French (LN-fr), covering 1,468 nouns, 202 adjectives and 299 verbs inflected via 14 distinct grammatical feature values. Our iterative distillation alternates two steps until convergence: (i) estimating lexical or grammatical vectors by averaging the embeddings of words that share the same lexeme or grammatical feature value, and (ii) isolating the complementary component of each word embedding by subtracting the estimated vector. To assess the quality of the decomposition, we measure whether the resulting lexical and grammatical vectors form more compact clusters within their respective groups and whether their sum better reconstructs the original word embeddings. All evaluations rely on L2 distance. The observed improvements in both clustering and reconstruction accuracy demonstrate the effectiveness of our approach.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-lareau-2025-disentangling">
<titleInfo>
<title>Disentangling lexical and grammatical information in word embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Li</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">François</namePart>
<namePart type="family">Lareau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th International Conference on Computational Semantics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kilian</namePart>
<namePart type="family">Evang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laura</namePart>
<namePart type="family">Kallmeyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sylvain</namePart>
<namePart type="family">Pogodalla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Düsseldorf, Germany</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-316-6</identifier>
</relatedItem>
<abstract>To enable finer-grained linguistic analysis, we propose a method for the separation of lexical and grammatical information within contextualized word embeddings. Using CamemBERT embeddings for French, we apply our method to 14,472 inflected word forms extracted from the Lexical Network of French (LN-fr), covering 1,468 nouns, 202 adjectives and 299 verbs inflected via 14 distinct grammatical feature values. Our iterative distillation alternates two steps until convergence: (i) estimating lexical or grammatical vectors by averaging the embeddings of words that share the same lexeme or grammatical feature value, and (ii) isolating the complementary component of each word embedding by subtracting the estimated vector. To assess the quality of the decomposition, we measure whether the resulting lexical and grammatical vectors form more compact clusters within their respective groups and whether their sum better reconstructs the original word embeddings. All evaluations rely on L2 distance. The observed improvements in both clustering and reconstruction accuracy demonstrate the effectiveness of our approach.</abstract>
<identifier type="citekey">liu-lareau-2025-disentangling</identifier>
<location>
<url>https://aclanthology.org/2025.iwcs-main.28/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>321</start>
<end>330</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Disentangling lexical and grammatical information in word embeddings
%A Liu, Li
%A Lareau, François
%Y Evang, Kilian
%Y Kallmeyer, Laura
%Y Pogodalla, Sylvain
%S Proceedings of the 16th International Conference on Computational Semantics
%D 2025
%8 September
%I Association for Computational Linguistics
%C Düsseldorf, Germany
%@ 979-8-89176-316-6
%F liu-lareau-2025-disentangling
%X To enable finer-grained linguistic analysis, we propose a method for the separation of lexical and grammatical information within contextualized word embeddings. Using CamemBERT embeddings for French, we apply our method to 14,472 inflected word forms extracted from the Lexical Network of French (LN-fr), covering 1,468 nouns, 202 adjectives and 299 verbs inflected via 14 distinct grammatical feature values. Our iterative distillation alternates two steps until convergence: (i) estimating lexical or grammatical vectors by averaging the embeddings of words that share the same lexeme or grammatical feature value, and (ii) isolating the complementary component of each word embedding by subtracting the estimated vector. To assess the quality of the decomposition, we measure whether the resulting lexical and grammatical vectors form more compact clusters within their respective groups and whether their sum better reconstructs the original word embeddings. All evaluations rely on L2 distance. The observed improvements in both clustering and reconstruction accuracy demonstrate the effectiveness of our approach.
%U https://aclanthology.org/2025.iwcs-main.28/
%P 321-330
Markdown (Informal)
[Disentangling lexical and grammatical information in word embeddings](https://aclanthology.org/2025.iwcs-main.28/) (Liu & Lareau, IWCS 2025)
ACL