@inproceedings{dehouck-2023-challenging,
title = "Challenging the {``}One Single Vector per Token{''} Assumption",
author = "Dehouck, Mathieu",
editor = "Jiang, Jing and
Reitter, David and
Deng, Shumin",
booktitle = "Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.conll-1.33",
doi = "10.18653/v1/2023.conll-1.33",
pages = "498--507",
abstract = "In this paper we question the almost universal assumption that in neural networks each token should be represented by a single vector. In fact, it is so natural to use one vector per word that most people do not even consider it as an assumption of their various models. Via a series of experiments on dependency parsing, in which we let each token in a sentence be represented by a sequence of vectors, we show that the {``}one single vector per token{''} assumption might be too strong for recurrent neural networks. Indeed, biaffine parsers seem to work better when their encoder accesses its input{'}s tokens{'} representations in several time steps rather than all at once. This seems to indicate that having only one occasion to look at a token through its vector is too strong a constraint for recurrent neural networks and calls for further studies on the way tokens are fed to neural networks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dehouck-2023-challenging">
<titleInfo>
<title>Challenging the “One Single Vector per Token” Assumption</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mathieu</namePart>
<namePart type="family">Dehouck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Reitter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shumin</namePart>
<namePart type="family">Deng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper we question the almost universal assumption that in neural networks each token should be represented by a single vector. In fact, it is so natural to use one vector per word that most people do not even consider it as an assumption of their various models. Via a series of experiments on dependency parsing, in which we let each token in a sentence be represented by a sequence of vectors, we show that the “one single vector per token” assumption might be too strong for recurrent neural networks. Indeed, biaffine parsers seem to work better when their encoder accesses its input’s tokens’ representations in several time steps rather than all at once. This seems to indicate that having only one occasion to look at a token through its vector is too strong a constraint for recurrent neural networks and calls for further studies on the way tokens are fed to neural networks.</abstract>
<identifier type="citekey">dehouck-2023-challenging</identifier>
<identifier type="doi">10.18653/v1/2023.conll-1.33</identifier>
<location>
<url>https://aclanthology.org/2023.conll-1.33</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>498</start>
<end>507</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Challenging the “One Single Vector per Token” Assumption
%A Dehouck, Mathieu
%Y Jiang, Jing
%Y Reitter, David
%Y Deng, Shumin
%S Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F dehouck-2023-challenging
%X In this paper we question the almost universal assumption that in neural networks each token should be represented by a single vector. In fact, it is so natural to use one vector per word that most people do not even consider it as an assumption of their various models. Via a series of experiments on dependency parsing, in which we let each token in a sentence be represented by a sequence of vectors, we show that the “one single vector per token” assumption might be too strong for recurrent neural networks. Indeed, biaffine parsers seem to work better when their encoder accesses its input’s tokens’ representations in several time steps rather than all at once. This seems to indicate that having only one occasion to look at a token through its vector is too strong a constraint for recurrent neural networks and calls for further studies on the way tokens are fed to neural networks.
%R 10.18653/v1/2023.conll-1.33
%U https://aclanthology.org/2023.conll-1.33
%U https://doi.org/10.18653/v1/2023.conll-1.33
%P 498-507
Markdown (Informal)
[Challenging the “One Single Vector per Token” Assumption](https://aclanthology.org/2023.conll-1.33) (Dehouck, CoNLL 2023)
ACL