@inproceedings{jenkins-etal-2023-split,
title = "To Split or Not to Split: Composing Compounds in Contextual Vector Spaces",
author = "Jenkins, Chris and
Miletic, Filip and
Schulte im Walde, Sabine",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.1002",
doi = "10.18653/v1/2023.emnlp-main.1002",
pages = "16131--16136",
abstract = "We investigate the effect of sub-word tokenization on representations of German noun compounds: single orthographic words which are composed of two or more constituents but often tokenized into units that are not morphologically motivated or meaningful. Using variants of BERT models and tokenization strategies on domain-specific restricted diachronic data, we introduce a suite of evaluations relying on the masked language modelling task and compositionality prediction. We obtain the most consistent improvements by pre-splitting compounds into constituents.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jenkins-etal-2023-split">
<titleInfo>
<title>To Split or Not to Split: Composing Compounds in Contextual Vector Spaces</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Jenkins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Filip</namePart>
<namePart type="family">Miletic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sabine</namePart>
<namePart type="family">Schulte im Walde</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We investigate the effect of sub-word tokenization on representations of German noun compounds: single orthographic words which are composed of two or more constituents but often tokenized into units that are not morphologically motivated or meaningful. Using variants of BERT models and tokenization strategies on domain-specific restricted diachronic data, we introduce a suite of evaluations relying on the masked language modelling task and compositionality prediction. We obtain the most consistent improvements by pre-splitting compounds into constituents.</abstract>
<identifier type="citekey">jenkins-etal-2023-split</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.1002</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.1002</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>16131</start>
<end>16136</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T To Split or Not to Split: Composing Compounds in Contextual Vector Spaces
%A Jenkins, Chris
%A Miletic, Filip
%A Schulte im Walde, Sabine
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F jenkins-etal-2023-split
%X We investigate the effect of sub-word tokenization on representations of German noun compounds: single orthographic words which are composed of two or more constituents but often tokenized into units that are not morphologically motivated or meaningful. Using variants of BERT models and tokenization strategies on domain-specific restricted diachronic data, we introduce a suite of evaluations relying on the masked language modelling task and compositionality prediction. We obtain the most consistent improvements by pre-splitting compounds into constituents.
%R 10.18653/v1/2023.emnlp-main.1002
%U https://aclanthology.org/2023.emnlp-main.1002
%U https://doi.org/10.18653/v1/2023.emnlp-main.1002
%P 16131-16136
Markdown (Informal)
[To Split or Not to Split: Composing Compounds in Contextual Vector Spaces](https://aclanthology.org/2023.emnlp-main.1002) (Jenkins et al., EMNLP 2023)
ACL