@inproceedings{chirkunov-etal-2026-linear,
title = "Linear Semantic Segmentation for Low-Resource Spoken Dialects",
author = "Chirkunov, Kirill and
Samih, Younes and
Freihat, Abed Alhakim and
Aldarmaki, Hanan",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1740/",
pages = "34844--34861",
ISBN = "979-8-89176-395-1",
abstract = "Semantic segmentation is a core component of discourse analysis, yet existing models are primarily developed and evaluated on high-resource written text, limiting their effectiveness on low-resource conversational varieties. In particular, dialectal Arabic exhibits informal syntax, code-switching, and weakly marked discourse structure that challenge standard semantic segmentation approaches for text. In this paper, we introduce a new multi-genre benchmark (more than 1000 samples) for semantic segmentation in Arabic, focusing on dialectal discourse. The benchmark covers casual telephone conversations, code-switched podcasts, expressive dialogue, and broadcast news, and was annotated and validated by native Arabic annotators. Using this benchmark, we show that segmentation models performing well on MSA news genres degrade on dialectal conversational texts. We further propose a segmentation model that targets local semantic coherence and robustness to discourse discontinuities, consistently outperforming strong baselines on dialectal non-news genres. The benchmark and approach generalize to other low-resource spoken languages."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chirkunov-etal-2026-linear">
<titleInfo>
<title>Linear Semantic Segmentation for Low-Resource Spoken Dialects</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kirill</namePart>
<namePart type="family">Chirkunov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Younes</namePart>
<namePart type="family">Samih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abed</namePart>
<namePart type="given">Alhakim</namePart>
<namePart type="family">Freihat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanan</namePart>
<namePart type="family">Aldarmaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Semantic segmentation is a core component of discourse analysis, yet existing models are primarily developed and evaluated on high-resource written text, limiting their effectiveness on low-resource conversational varieties. In particular, dialectal Arabic exhibits informal syntax, code-switching, and weakly marked discourse structure that challenge standard semantic segmentation approaches for text. In this paper, we introduce a new multi-genre benchmark (more than 1000 samples) for semantic segmentation in Arabic, focusing on dialectal discourse. The benchmark covers casual telephone conversations, code-switched podcasts, expressive dialogue, and broadcast news, and was annotated and validated by native Arabic annotators. Using this benchmark, we show that segmentation models performing well on MSA news genres degrade on dialectal conversational texts. We further propose a segmentation model that targets local semantic coherence and robustness to discourse discontinuities, consistently outperforming strong baselines on dialectal non-news genres. The benchmark and approach generalize to other low-resource spoken languages.</abstract>
<identifier type="citekey">chirkunov-etal-2026-linear</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1740/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>34844</start>
<end>34861</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Linear Semantic Segmentation for Low-Resource Spoken Dialects
%A Chirkunov, Kirill
%A Samih, Younes
%A Freihat, Abed Alhakim
%A Aldarmaki, Hanan
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F chirkunov-etal-2026-linear
%X Semantic segmentation is a core component of discourse analysis, yet existing models are primarily developed and evaluated on high-resource written text, limiting their effectiveness on low-resource conversational varieties. In particular, dialectal Arabic exhibits informal syntax, code-switching, and weakly marked discourse structure that challenge standard semantic segmentation approaches for text. In this paper, we introduce a new multi-genre benchmark (more than 1000 samples) for semantic segmentation in Arabic, focusing on dialectal discourse. The benchmark covers casual telephone conversations, code-switched podcasts, expressive dialogue, and broadcast news, and was annotated and validated by native Arabic annotators. Using this benchmark, we show that segmentation models performing well on MSA news genres degrade on dialectal conversational texts. We further propose a segmentation model that targets local semantic coherence and robustness to discourse discontinuities, consistently outperforming strong baselines on dialectal non-news genres. The benchmark and approach generalize to other low-resource spoken languages.
%U https://aclanthology.org/2026.findings-acl.1740/
%P 34844-34861
Markdown (Informal)
[Linear Semantic Segmentation for Low-Resource Spoken Dialects](https://aclanthology.org/2026.findings-acl.1740/) (Chirkunov et al., Findings 2026)
ACL
- Kirill Chirkunov, Younes Samih, Abed Alhakim Freihat, and Hanan Aldarmaki. 2026. Linear Semantic Segmentation for Low-Resource Spoken Dialects. In Findings of the Association for Computational Linguistics: ACL 2026, pages 34844–34861, San Diego, California, United States. Association for Computational Linguistics.