@inproceedings{znotins-gruzitis-2025-conversational,
title = "From Conversational Speech to Readable Text: Post-Processing Noisy Transcripts in a Low-Resource Setting",
author = "Znotins, Arturs and
Gruzitis, Normunds and
Dargis, Roberts",
editor = "Bak, JinYeong and
Goot, Rob van der and
Jang, Hyeju and
Buaphet, Weerayut and
Ramponi, Alan and
Xu, Wei and
Ritter, Alan",
booktitle = "Proceedings of the Tenth Workshop on Noisy and User-generated Text",
month = may,
year = "2025",
address = "Albuquerque, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.wnut-1.15/",
doi = "10.18653/v1/2025.wnut-1.15",
pages = "143--148",
ISBN = "979-8-89176-232-9",
abstract = "We present ongoing research on automatic post-processing approaches to enhance the readability of noisy speech transcripts in low-resource languages, with a focus on conversational speech in Latvian. We compare transformer-based sequence-labeling models and large language models (LLMs) for the standard punctuation and capitalization restoration task, while also considering automatic correction of mispronounced words and disfluency, and partial inverse text normalization. Our results show that very small LLMs (approx. 2B parameters), fine-tuned on a modest text corpus, can achieve near state-of-the-art performance, rivaling orders of magnitude larger LLMs. Additionally, we demonstrate that a fine-tuned Whisper model, leveraging acoustic cues, outperforms text-only systems on challenging conversational data, even for a low-resource language. Error analysis reveals recurring pitfalls in sentence boundary determination and disfluency handling, emphasizing the importance of consistent annotation and domain adaptation for robust post-processing. Our findings highlight the feasibility of developing efficient post-processing solutions that significantly refine ASR output in low-resource settings, while opening new possibilities for editing and formatting speech transcripts beyond mere restoration of punctuation and capitalization."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="znotins-gruzitis-2025-conversational">
<titleInfo>
<title>From Conversational Speech to Readable Text: Post-Processing Noisy Transcripts in a Low-Resource Setting</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arturs</namePart>
<namePart type="family">Znotins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Normunds</namePart>
<namePart type="family">Gruzitis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberts</namePart>
<namePart type="family">Dargis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth Workshop on Noisy and User-generated Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">JinYeong</namePart>
<namePart type="family">Bak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rob</namePart>
<namePart type="given">van</namePart>
<namePart type="given">der</namePart>
<namePart type="family">Goot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyeju</namePart>
<namePart type="family">Jang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weerayut</namePart>
<namePart type="family">Buaphet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ramponi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-232-9</identifier>
</relatedItem>
<abstract>We present ongoing research on automatic post-processing approaches to enhance the readability of noisy speech transcripts in low-resource languages, with a focus on conversational speech in Latvian. We compare transformer-based sequence-labeling models and large language models (LLMs) for the standard punctuation and capitalization restoration task, while also considering automatic correction of mispronounced words and disfluency, and partial inverse text normalization. Our results show that very small LLMs (approx. 2B parameters), fine-tuned on a modest text corpus, can achieve near state-of-the-art performance, rivaling orders of magnitude larger LLMs. Additionally, we demonstrate that a fine-tuned Whisper model, leveraging acoustic cues, outperforms text-only systems on challenging conversational data, even for a low-resource language. Error analysis reveals recurring pitfalls in sentence boundary determination and disfluency handling, emphasizing the importance of consistent annotation and domain adaptation for robust post-processing. Our findings highlight the feasibility of developing efficient post-processing solutions that significantly refine ASR output in low-resource settings, while opening new possibilities for editing and formatting speech transcripts beyond mere restoration of punctuation and capitalization.</abstract>
<identifier type="citekey">znotins-gruzitis-2025-conversational</identifier>
<identifier type="doi">10.18653/v1/2025.wnut-1.15</identifier>
<location>
<url>https://aclanthology.org/2025.wnut-1.15/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>143</start>
<end>148</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T From Conversational Speech to Readable Text: Post-Processing Noisy Transcripts in a Low-Resource Setting
%A Znotins, Arturs
%A Gruzitis, Normunds
%A Dargis, Roberts
%Y Bak, JinYeong
%Y Goot, Rob van der
%Y Jang, Hyeju
%Y Buaphet, Weerayut
%Y Ramponi, Alan
%Y Xu, Wei
%Y Ritter, Alan
%S Proceedings of the Tenth Workshop on Noisy and User-generated Text
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico, USA
%@ 979-8-89176-232-9
%F znotins-gruzitis-2025-conversational
%X We present ongoing research on automatic post-processing approaches to enhance the readability of noisy speech transcripts in low-resource languages, with a focus on conversational speech in Latvian. We compare transformer-based sequence-labeling models and large language models (LLMs) for the standard punctuation and capitalization restoration task, while also considering automatic correction of mispronounced words and disfluency, and partial inverse text normalization. Our results show that very small LLMs (approx. 2B parameters), fine-tuned on a modest text corpus, can achieve near state-of-the-art performance, rivaling orders of magnitude larger LLMs. Additionally, we demonstrate that a fine-tuned Whisper model, leveraging acoustic cues, outperforms text-only systems on challenging conversational data, even for a low-resource language. Error analysis reveals recurring pitfalls in sentence boundary determination and disfluency handling, emphasizing the importance of consistent annotation and domain adaptation for robust post-processing. Our findings highlight the feasibility of developing efficient post-processing solutions that significantly refine ASR output in low-resource settings, while opening new possibilities for editing and formatting speech transcripts beyond mere restoration of punctuation and capitalization.
%R 10.18653/v1/2025.wnut-1.15
%U https://aclanthology.org/2025.wnut-1.15/
%U https://doi.org/10.18653/v1/2025.wnut-1.15
%P 143-148
Markdown (Informal)
[From Conversational Speech to Readable Text: Post-Processing Noisy Transcripts in a Low-Resource Setting](https://aclanthology.org/2025.wnut-1.15/) (Znotins et al., WNUT 2025)
ACL