% BibTeX record for the WIGVO paper (ACL 2026, Volume 3: System Demonstrations).
% Inline emphasis in the abstract is expressed with LaTeX macros, not Markdown.
@inproceedings{kim-etal-2026-wigvo,
  title     = {{WIGVO}: Real-Time Bidirectional Speech Translation over Legacy {PSTN} Calls via Dual-Session Echo Gating},
  author    = {Kim, Hyeong-seob and
               Son, Sang-Woo and
               Cho, Hyun-woo and
               Kim, Hyeonsang and
               Kim, Jinmo},
  editor    = {Durrett, Greg and
               Jian, Ping},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 3: System Demonstrations)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-demo.33/},
  pages     = {336--344},
  isbn      = {979-8-89176-392-0},
  abstract  = {Real-time speech translation with large language models (LLMs) has become feasible in controlled wideband settings{---}mobile apps, web browsers, and end-to-end full-duplex systems pushing latency below 200 ms{---}where developers can assume client-side echo cancellation. However, deploying such systems over the Public Switched Telephone Network (PSTN) remains challenging due to narrowband G.711 audio, unpredictable round-trip delays, and absence of client-side signal processing. We present \textbf{WIGVO} (WIGTN Voice-Only), a server-side relay system that enables bidirectional LLM-based speech translation over ordinary telephone calls without requiring app installation or carrier integration. A central contribution is addressing what we term \emph{echo-induced self-reinforcing translation loops}: synthesized speech echoing back through the PSTN gets re-ingested and repeatedly translated. WIGVO solves this through a dual-session architecture with deterministic silence injection and energy-based voice activity detection (VAD) gating. We evaluate WIGVO on 155 Korean{--}English PSTN calls (148 instrumented, 147 completed) across three communication modes{---}voice-to-voice (V2V), text-to-voice (T2V), and full-agent{---}observing 555 ms median caller-to-callee latency and 2,684 ms median callee-to-caller latency, zero echo-induced translation loops, COMET semantic adequacy of 0.71 (en{\textrightarrow}ko) and 0.62 (ko{\textrightarrow}en) against offline LLM references, and USD 0.28 per minute cost. The system is deployed at https://wigvo.wigtn.com, with a video walkthrough at https://youtu.be/4Uf6zMPOInY. Evaluation scripts and anonymized call logs are available in the open-source repository.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for kim-etal-2026-wigvo: the paper itself, with the host proceedings as a relatedItem. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kim-etal-2026-wigvo">
    <titleInfo>
      <title>WIGVO: Real-Time Bidirectional Speech Translation over Legacy PSTN Calls via Dual-Session Echo Gating</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Hyeong-seob</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sang-Woo</namePart>
      <namePart type="family">Son</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hyun-woo</namePart>
      <namePart type="family">Cho</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hyeonsang</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jinmo</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Greg</namePart>
        <namePart type="family">Durrett</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ping</namePart>
        <namePart type="family">Jian</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-392-0</identifier>
    </relatedItem>
    <!-- Plain-text abstract: no inline markup (Markdown emphasis markers removed). -->
    <abstract>Real-time speech translation with large language models (LLMs) has become feasible in controlled wideband settings—mobile apps, web browsers, and end-to-end full-duplex systems pushing latency below 200 ms—where developers can assume client-side echo cancellation. However, deploying such systems over the Public Switched Telephone Network (PSTN) remains challenging due to narrowband G.711 audio, unpredictable round-trip delays, and absence of client-side signal processing. We present WIGVO (WIGTN Voice-Only), a server-side relay system that enables bidirectional LLM-based speech translation over ordinary telephone calls without requiring app installation or carrier integration. A central contribution is addressing what we term echo-induced self-reinforcing translation loops: synthesized speech echoing back through the PSTN gets re-ingested and repeatedly translated. WIGVO solves this through a dual-session architecture with deterministic silence injection and energy-based voice activity detection (VAD) gating. We evaluate WIGVO on 155 Korean–English PSTN calls (148 instrumented, 147 completed) across three communication modes—voice-to-voice (V2V), text-to-voice (T2V), and full-agent—observing 555 ms median caller-to-callee latency and 2,684 ms median callee-to-caller latency, zero echo-induced translation loops, COMET semantic adequacy of 0.71 (en→ko) and 0.62 (ko→en) against offline LLM references, and USD 0.28 per minute cost. The system is deployed at https://wigvo.wigtn.com, with a video walkthrough at https://youtu.be/4Uf6zMPOInY. Evaluation scripts and anonymized call logs are available in the open-source repository.</abstract>
    <identifier type="citekey">kim-etal-2026-wigvo</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-demo.33/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>336</start>
        <end>344</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T WIGVO: Real-Time Bidirectional Speech Translation over Legacy PSTN Calls via Dual-Session Echo Gating
%A Kim, Hyeong-seob
%A Son, Sang-Woo
%A Cho, Hyun-woo
%A Kim, Hyeonsang
%A Kim, Jinmo
%Y Durrett, Greg
%Y Jian, Ping
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-392-0
%F kim-etal-2026-wigvo
%X Real-time speech translation with large language models (LLMs) has become feasible in controlled wideband settings—mobile apps, web browsers, and end-to-end full-duplex systems pushing latency below 200 ms—where developers can assume client-side echo cancellation. However, deploying such systems over the Public Switched Telephone Network (PSTN) remains challenging due to narrowband G.711 audio, unpredictable round-trip delays, and absence of client-side signal processing. We present WIGVO (WIGTN Voice-Only), a server-side relay system that enables bidirectional LLM-based speech translation over ordinary telephone calls without requiring app installation or carrier integration. A central contribution is addressing what we term echo-induced self-reinforcing translation loops: synthesized speech echoing back through the PSTN gets re-ingested and repeatedly translated. WIGVO solves this through a dual-session architecture with deterministic silence injection and energy-based voice activity detection (VAD) gating. We evaluate WIGVO on 155 Korean–English PSTN calls (148 instrumented, 147 completed) across three communication modes—voice-to-voice (V2V), text-to-voice (T2V), and full-agent—observing 555 ms median caller-to-callee latency and 2,684 ms median callee-to-caller latency, zero echo-induced translation loops, COMET semantic adequacy of 0.71 (en→ko) and 0.62 (ko→en) against offline LLM references, and USD 0.28 per minute cost. The system is deployed at https://wigvo.wigtn.com, with a video walkthrough at https://youtu.be/4Uf6zMPOInY. Evaluation scripts and anonymized call logs are available in the open-source repository.
%U https://aclanthology.org/2026.acl-demo.33/
%P 336-344
Markdown (Informal)
[WIGVO: Real-Time Bidirectional Speech Translation over Legacy PSTN Calls via Dual-Session Echo Gating](https://aclanthology.org/2026.acl-demo.33/) (Kim et al., ACL 2026)
ACL