From ee96c05d154802916ec1163f1e17d11415f0d071 Mon Sep 17 00:00:00 2001
From: Jacob Schmidt
Date: Sun, 1 Mar 2026 00:39:11 -0600
Subject: [PATCH] Filter silent audio and blank placeholder transcripts in S2T

---
 Journal.Sidecar/LocalWhisperS2TService.cs | 54 +++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/Journal.Sidecar/LocalWhisperS2TService.cs b/Journal.Sidecar/LocalWhisperS2TService.cs
index 2f6a2f2..26fe532 100644
--- a/Journal.Sidecar/LocalWhisperS2TService.cs
+++ b/Journal.Sidecar/LocalWhisperS2TService.cs
@@ -14,6 +14,7 @@ public sealed class LocalWhisperS2TService : IS2TService, IDisposable
     private const int Channels = 1;
     private const int ChunkMs = 2000;
     private const int MaxBufferedItems = 256;
+    private const int SilenceRmsThreshold = 150;
 
     private readonly object _sync = new();
     private readonly object _segmentLock = new();
@@ -202,6 +203,9 @@ public sealed class LocalWhisperS2TService : IS2TService, IDisposable
         {
             try
             {
+                if (IsLikelySilence(pcmChunk))
+                    continue;
+
                 using var pcmStream = new MemoryStream(pcmChunk, writable: false);
                 using var raw = new RawSourceWaveStream(pcmStream, waveFormat);
                 using var wavStream = new MemoryStream();
@@ -213,6 +217,8 @@ public sealed class LocalWhisperS2TService : IS2TService, IDisposable
                     var text = result.Text?.Trim();
                     if (string.IsNullOrWhiteSpace(text))
                         continue;
+                    if (IsPlaceholderTranscript(text))
+                        continue;
                     EnqueueTranscript(text);
                 }
             }
@@ -253,6 +259,54 @@ public sealed class LocalWhisperS2TService : IS2TService, IDisposable
         return modelPath;
     }
 
+    /// <summary>
+    /// Reports whether a chunk, interpreted as 16-bit little-endian signed PCM samples,
+    /// has a root-mean-square amplitude below <see cref="SilenceRmsThreshold"/>.
+    /// </summary>
+    /// <param name="pcmChunk">Raw PCM bytes; a trailing odd byte is ignored.</param>
+    /// <returns>
+    /// <c>true</c> when the chunk holds no complete sample or its RMS is under the
+    /// threshold; otherwise <c>false</c>.
+    /// </returns>
+    private static bool IsLikelySilence(byte[] pcmChunk)
+    {
+        // Fewer than two bytes means there is not even one 16-bit sample to measure.
+        if (pcmChunk.Length < 2)
+            return true;
+
+        // Length >= 2 here, so the sample count is always at least one.
+        int samples = pcmChunk.Length / 2;
+        long sumSquares = 0;
+        for (int i = 0; i + 1 < pcmChunk.Length; i += 2)
+        {
+            // Assemble a signed sample from the low byte followed by the high byte.
+            short sample = (short)(pcmChunk[i] | (pcmChunk[i + 1] << 8));
+            sumSquares += (long)sample * sample;
+        }
+
+        var rms = Math.Sqrt(sumSquares / (double)samples);
+        return rms < SilenceRmsThreshold;
+    }
+
+    /// <summary>
+    /// Reports whether a transcript is exactly one of the bracketed placeholder markers
+    /// <c>[BLANK_AUDIO]</c>, <c>[NO AUDIO]</c> or <c>[SILENCE]</c>.
+    /// </summary>
+    /// <param name="text">Transcript text; surrounding whitespace is ignored.</param>
+    /// <returns><c>true</c> on a case-insensitive exact match with one of the markers.</returns>
+    private static bool IsPlaceholderTranscript(string text)
+    {
+        var normalized = text.Trim();
+
+        // Cheap pre-check: every marker compared below is wrapped in square brackets.
+        if (!normalized.StartsWith('[') || !normalized.EndsWith(']'))
+            return false;
+
+        return normalized.Equals("[BLANK_AUDIO]", StringComparison.OrdinalIgnoreCase)
+            || normalized.Equals("[NO AUDIO]", StringComparison.OrdinalIgnoreCase)
+            || normalized.Equals("[SILENCE]", StringComparison.OrdinalIgnoreCase);
+    }
+
     private void EnqueueTranscript(string text)
     {
         _transcripts.Enqueue(text);