% BibTeX entry for the paper at https://aclanthology.org/2026.mwe-1.20/
% (text outside an @entry is ignored by BibTeX, so these lines act as comments).
@inproceedings{karatepe-etal-2026-sahara,
    title = "{S}ahara Tokenizers at {PARSEME} 2.0 Subtask 1: Combining Contextual Embeddings with Structural Decoding for Multi-Word Expression Detection",
    author = {Karatepe, Yunus  and
      S{\"u}l{\"u}k, Mert  and
      K{\i}r{\i}ml{\i}, Zeynep Tu{\u{g}}{\c{c}}e  and
      {\"O}zbay, Beg{\"u}m},
    editor = {Ojha, Atul Kr.  and
      Mititelu, Verginica Barbu  and
      Constant, Mathieu  and
      Stoyanova, Ivelina  and
      Do{\u{g}}ru{\"o}z, A. Seza  and
      Rademaker, Alexandre},
    booktitle = "Proceedings of the 22nd Workshop on Multiword Expressions ({MWE} 2026)",
    month = mar,
    year = "2026",
    address = "Rabat, Morocco",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.mwe-1.20/",
    pages = "154--159",
    ISBN = "979-8-89176-363-0",
    abstract = "Multi-Word Expressions (MWEs) pose a significant challenge for natural language processing systems due to their idiosyncratic semantic and syntactic properties. This paper describes our system for the PARSEME 2.0 Shared Task on automatic identification of verbal MWEs across 17 typologically diverse languages. Our approach combines multilingual BERT with explicit Part-of-Speech (POS) feature injection through a dual-head architecture that jointly performs BIO-based identification and category classification. We further investigate extensions, including Conditional Random Field (CRF) decoding for structured prediction, focal loss for addressing class imbalance, and model ensembling for improving discontinuous MWE detection. Our official submission achieves a global MWE-based F1 score of 48.39{\%}, securing second place in the shared task. Ablation studies reveal a strong synergy between POS features and CRF decoding, with the combined approach yielding the best single-model performance. Furthermore, ensembling models trained with different objectives improves both overall F1 score and discontinuous MWE scores, demonstrating the importance of training diversity for capturing non-adjacent syntactic patterns."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="karatepe-etal-2026-sahara">
<titleInfo>
<title>Sahara Tokenizers at PARSEME 2.0 Subtask 1: Combining Contextual Embeddings with Structural Decoding for Multi-Word Expression Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yunus</namePart>
<namePart type="family">Karatepe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mert</namePart>
<namePart type="family">Sülük</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeynep</namePart>
<namePart type="given">Tuğçe</namePart>
<namePart type="family">Kırımlı</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Begüm</namePart>
<namePart type="family">Özbay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Workshop on Multiword Expressions (MWE 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Atul</namePart>
<namePart type="given">Kr.</namePart>
<namePart type="family">Ojha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Verginica</namePart>
<namePart type="given">Barbu</namePart>
<namePart type="family">Mititelu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mathieu</namePart>
<namePart type="family">Constant</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivelina</namePart>
<namePart type="family">Stoyanova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">A.</namePart>
<namePart type="given">Seza</namePart>
<namePart type="family">Doğruöz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandre</namePart>
<namePart type="family">Rademaker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-363-0</identifier>
</relatedItem>
<abstract>Multi-Word Expressions (MWEs) pose a significant challenge for natural language processing systems due to their idiosyncratic semantic and syntactic properties. This paper describes our system for the PARSEME 2.0 Shared Task on automatic identification of verbal MWEs across 17 typologically diverse languages. Our approach combines multilingual BERT with explicit Part-of-Speech (POS) feature injection through a dual-head architecture that jointly performs BIO-based identification and category classification. We further investigate extensions, including Conditional Random Field (CRF) decoding for structured prediction, focal loss for addressing class imbalance, and model ensembling for improving discontinuous MWE detection. Our official submission achieves a global MWE-based F1 score of 48.39%, securing second place in the shared task. Ablation studies reveal a strong synergy between POS features and CRF decoding, with the combined approach yielding the best single-model performance. Furthermore, ensembling models trained with different objectives improves both overall F1 score and discontinuous MWE scores, demonstrating the importance of training diversity for capturing non-adjacent syntactic patterns.</abstract>
<identifier type="citekey">karatepe-etal-2026-sahara</identifier>
<location>
<url>https://aclanthology.org/2026.mwe-1.20/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>154</start>
<end>159</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Sahara Tokenizers at PARSEME 2.0 Subtask 1: Combining Contextual Embeddings with Structural Decoding for Multi-Word Expression Detection
%A Karatepe, Yunus
%A Sülük, Mert
%A Kırımlı, Zeynep Tuğçe
%A Özbay, Begüm
%Y Ojha, Atul Kr.
%Y Mititelu, Verginica Barbu
%Y Constant, Mathieu
%Y Stoyanova, Ivelina
%Y Doğruöz, A. Seza
%Y Rademaker, Alexandre
%S Proceedings of the 22nd Workshop on Multiword Expressions (MWE 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-363-0
%F karatepe-etal-2026-sahara
%X Multi-Word Expressions (MWEs) pose a significant challenge for natural language processing systems due to their idiosyncratic semantic and syntactic properties. This paper describes our system for the PARSEME 2.0 Shared Task on automatic identification of verbal MWEs across 17 typologically diverse languages. Our approach combines multilingual BERT with explicit Part-of-Speech (POS) feature injection through a dual-head architecture that jointly performs BIO-based identification and category classification. We further investigate extensions, including Conditional Random Field (CRF) decoding for structured prediction, focal loss for addressing class imbalance, and model ensembling for improving discontinuous MWE detection. Our official submission achieves a global MWE-based F1 score of 48.39%, securing second place in the shared task. Ablation studies reveal a strong synergy between POS features and CRF decoding, with the combined approach yielding the best single-model performance. Furthermore, ensembling models trained with different objectives improves both overall F1 score and discontinuous MWE scores, demonstrating the importance of training diversity for capturing non-adjacent syntactic patterns.
%U https://aclanthology.org/2026.mwe-1.20/
%P 154-159
Markdown (Informal)
[Sahara Tokenizers at PARSEME 2.0 Subtask 1: Combining Contextual Embeddings with Structural Decoding for Multi-Word Expression Detection](https://aclanthology.org/2026.mwe-1.20/) (Karatepe et al., MWE 2026)
ACL