@inproceedings{iranmanesh-etal-2026-segmentation,
title = "Segmentation Strategy Matters: Benchmarking Whisper on {P}ersian {Y}ou{T}ube Content",
author = "Iranmanesh, Reihaneh and
Ziaei, Rojin and
Garman, Joe",
editor = "Merchant, Rayyan and
Megerdoomian, Karine",
booktitle = "The Proceedings of the First Workshop on {NLP} and {LLM}s for the {I}ranian Language Family",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.silkroadnlp-1.13/",
pages = "121--130",
ISBN = "979-8-89176-371-5",
abstract = "Automatic Speech Recognition (ASR) transcription accuracy remains highly sensitive to audio segmentation strategies, yet most benchmarks assume oracle timestamps unavailable in deployment. We systematically evaluate how audio segmentation affects Whisper{'}s performance on 10 hours of Persian YouTube content, comparing transcript-aligned (oracle) versus silence-based (realistic) approaches across contrasting acoustic conditions. Results reveal striking content-type dependency: podcast content benefits from timestamp segmentation (33{\%} lower mean WER), while entertainment content favors silence-based segmentation (8{\%} lower mean WER). This finding demonstrates that optimal segmentation must be content-aware, with silence detection better capturing natural boundaries in acoustically heterogeneous media while avoiding mid-utterance splits. We publicly release our evaluation framework, 10 hours of audio with gold transcripts, and segmentation results here: https://github.com/ri164-bolleit/persian-youtube-whisper-benchmark"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="iranmanesh-etal-2026-segmentation">
<titleInfo>
<title>Segmentation Strategy Matters: Benchmarking Whisper on Persian YouTube Content</title>
</titleInfo>
<name type="personal">
<namePart type="given">Reihaneh</namePart>
<namePart type="family">Iranmanesh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rojin</namePart>
<namePart type="family">Ziaei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joe</namePart>
<namePart type="family">Garman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>The Proceedings of the First Workshop on NLP and LLMs for the Iranian Language Family</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rayyan</namePart>
<namePart type="family">Merchant</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karine</namePart>
<namePart type="family">Megerdoomian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-371-5</identifier>
</relatedItem>
<abstract>Automatic Speech Recognition (ASR) transcription accuracy remains highly sensitive to audio segmentation strategies, yet most benchmarks assume oracle timestamps unavailable in deployment. We systematically evaluate how audio segmentation affects Whisper’s performance on 10 hours of Persian YouTube content, comparing transcript-aligned (oracle) versus silence-based (realistic) approaches across contrasting acoustic conditions. Results reveal striking content-type dependency: podcast content benefits from timestamp segmentation (33% lower mean WER), while entertainment content favors silence-based segmentation (8% lower mean WER). This finding demonstrates that optimal segmentation must be content-aware, with silence detection better capturing natural boundaries in acoustically heterogeneous media while avoiding mid-utterance splits. We publicly release our evaluation framework, 10 hours of audio with gold transcripts, and segmentation results here: https://github.com/ri164-bolleit/persian-youtube-whisper-benchmark</abstract>
<identifier type="citekey">iranmanesh-etal-2026-segmentation</identifier>
<location>
<url>https://aclanthology.org/2026.silkroadnlp-1.13/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>121</start>
<end>130</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Segmentation Strategy Matters: Benchmarking Whisper on Persian YouTube Content
%A Iranmanesh, Reihaneh
%A Ziaei, Rojin
%A Garman, Joe
%Y Merchant, Rayyan
%Y Megerdoomian, Karine
%S The Proceedings of the First Workshop on NLP and LLMs for the Iranian Language Family
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-371-5
%F iranmanesh-etal-2026-segmentation
%X Automatic Speech Recognition (ASR) transcription accuracy remains highly sensitive to audio segmentation strategies, yet most benchmarks assume oracle timestamps unavailable in deployment. We systematically evaluate how audio segmentation affects Whisper’s performance on 10 hours of Persian YouTube content, comparing transcript-aligned (oracle) versus silence-based (realistic) approaches across contrasting acoustic conditions. Results reveal striking content-type dependency: podcast content benefits from timestamp segmentation (33% lower mean WER), while entertainment content favors silence-based segmentation (8% lower mean WER). This finding demonstrates that optimal segmentation must be content-aware, with silence detection better capturing natural boundaries in acoustically heterogeneous media while avoiding mid-utterance splits. We publicly release our evaluation framework, 10 hours of audio with gold transcripts, and segmentation results here: https://github.com/ri164-bolleit/persian-youtube-whisper-benchmark
%U https://aclanthology.org/2026.silkroadnlp-1.13/
%P 121-130
Markdown (Informal)
[Segmentation Strategy Matters: Benchmarking Whisper on Persian YouTube Content](https://aclanthology.org/2026.silkroadnlp-1.13/) (Iranmanesh et al., SilkRoadNLP 2026)
ACL