From 7f4ffa99c493a177d6480f180a4ee84d0c1dc85c Mon Sep 17 00:00:00 2001 From: ManthanNimodiya Date: Sun, 28 Jun 2026 12:25:56 +0530 Subject: [PATCH 1/3] fix(captions): serialise transcription and release Whisper context to prevent Metal memory exhaustion --- apps/desktop/src-tauri/src/captions.rs | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/apps/desktop/src-tauri/src/captions.rs b/apps/desktop/src-tauri/src/captions.rs index 0b8d6fe06ae..cb0b9aa8bc6 100644 --- a/apps/desktop/src-tauri/src/captions.rs +++ b/apps/desktop/src-tauri/src/captions.rs @@ -53,6 +53,11 @@ impl Default for CaptionData { lazy_static::lazy_static! { static ref WHISPER_CONTEXT: Arc<Mutex<Option<Arc<WhisperContext>>>> = Arc::new(Mutex::new(None)); + // Serialises transcription so at most one WhisperState / Parakeet session + // exists at a time. On Apple Silicon each WhisperState allocates ~700 MB + // of Metal (unified) memory. Without this lock, rapid re-clicks create N + // concurrent states and exhaust RAM (observed: 44 GB for ~60 retries). + static ref TRANSCRIPTION_LOCK: Mutex<()> = Mutex::new(()); } #[cfg(not(all(target_os = "macos", target_arch = "x86_64")))] @@ -1107,6 +1112,12 @@ pub async fn transcribe_audio( ); } + // Hold the lock for the entire blocking call so that at most one + // WhisperState / Parakeet session exists at a time. Without this, rapid + // re-clicks spawn N concurrent sessions each consuming ~700 MB of Metal + // (unified) memory on Apple Silicon, which produced the observed 44 GB spike. 
+ let _transcription_guard = TRANSCRIPTION_LOCK.lock().await; + let transcription_result = match engine { TranscriptionEngine::Parakeet => { log::info!("Using Parakeet TDT engine"); @@ -1134,11 +1145,21 @@ pub async fn transcribe_audio( .unwrap_or_default(); log::info!("Starting Whisper transcription in blocking task..."); - tokio::task::spawn_blocking(move || { + let result = tokio::task::spawn_blocking(move || { process_with_whisper(&audio_path, context, &language, &transcription_hints) }) .await - .map_err(|e| format!("Whisper task panicked: {e}"))? + .map_err(|e| format!("Whisper task panicked: {e}"))?; + + // Release the cached context immediately after use so Metal buffers + // (~500 MB on Apple Silicon) are freed rather than held until the + // editor closes. The next call will reload the model as needed. + { + let mut ctx = WHISPER_CONTEXT.lock().await; + *ctx = None; + } + + result } }; From 3923ac74ac0e94c40b6a8a5ecf83ac41eeb39f9c Mon Sep 17 00:00:00 2001 From: ManthanNimodiya Date: Sun, 28 Jun 2026 17:43:47 +0530 Subject: [PATCH 2/3] fix(captions): move serialisation lock into spawn_blocking and gate context eviction to aarch64 --- apps/desktop/src-tauri/src/captions.rs | 36 ++++++++++++++------------ 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/apps/desktop/src-tauri/src/captions.rs b/apps/desktop/src-tauri/src/captions.rs index cb0b9aa8bc6..955311791cd 100644 --- a/apps/desktop/src-tauri/src/captions.rs +++ b/apps/desktop/src-tauri/src/captions.rs @@ -53,11 +53,13 @@ impl Default for CaptionData { lazy_static::lazy_static! { static ref WHISPER_CONTEXT: Arc<Mutex<Option<Arc<WhisperContext>>>> = Arc::new(Mutex::new(None)); - // Serialises transcription so at most one WhisperState / Parakeet session - // exists at a time. On Apple Silicon each WhisperState allocates ~700 MB - // of Metal (unified) memory. Without this lock, rapid re-clicks create N - // concurrent states and exhaust RAM (observed: 44 GB for ~60 retries). 
- static ref TRANSCRIPTION_LOCK: Mutex<()> = Mutex::new(()); + // std::sync::Mutex so the guard is held by the blocking thread itself. + // If the async future is dropped mid-transcription the blocking thread + // continues to hold this lock until it finishes, preventing a racing + // retry from spawning a second ML session concurrently. + // On Apple Silicon each WhisperState allocates ~700 MB of Metal (unified) + // memory; without serialisation rapid re-clicks exhausted 44 GB of RAM. + static ref TRANSCRIPTION_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); } #[cfg(not(all(target_os = "macos", target_arch = "x86_64")))] @@ -1112,19 +1114,16 @@ pub async fn transcribe_audio( ); } - // Hold the lock for the entire blocking call so that at most one - // WhisperState / Parakeet session exists at a time. Without this, rapid - // re-clicks spawn N concurrent sessions each consuming ~700 MB of Metal - // (unified) memory on Apple Silicon, which produced the observed 44 GB spike. - let _transcription_guard = TRANSCRIPTION_LOCK.lock().await; - let transcription_result = match engine { TranscriptionEngine::Parakeet => { log::info!("Using Parakeet TDT engine"); let model_dir = model_path.clone(); - tokio::task::spawn_blocking(move || process_with_parakeet(&audio_path, &model_dir)) - .await - .map_err(|e| format!("Parakeet task panicked: {e}"))? + tokio::task::spawn_blocking(move || { + let _guard = TRANSCRIPTION_LOCK.lock().unwrap(); + process_with_parakeet(&audio_path, &model_dir) + }) + .await + .map_err(|e| format!("Parakeet task panicked: {e}"))? 
} TranscriptionEngine::Whisper => { let context = match get_whisper_context(&model_path).await { @@ -1146,14 +1145,17 @@ pub async fn transcribe_audio( log::info!("Starting Whisper transcription in blocking task..."); let result = tokio::task::spawn_blocking(move || { + let _guard = TRANSCRIPTION_LOCK.lock().unwrap(); process_with_whisper(&audio_path, context, &language, &transcription_hints) }) .await .map_err(|e| format!("Whisper task panicked: {e}"))?; - // Release the cached context immediately after use so Metal buffers - // (~500 MB on Apple Silicon) are freed rather than held until the - // editor closes. The next call will reload the model as needed. + // Release the cached context so Metal buffers (~500 MB on Apple Silicon) + // are freed after each run rather than held until the editor closes. + // Gated to aarch64 only: on other platforms the cache improves + // repeated-transcription latency with no meaningful memory cost. + #[cfg(all(target_os = "macos", target_arch = "aarch64"))] { let mut ctx = WHISPER_CONTEXT.lock().await; *ctx = None; From e64725b57196cfebb3726f00d4e3af953245c00b Mon Sep 17 00:00:00 2001 From: ManthanNimodiya Date: Mon, 29 Jun 2026 22:20:59 +0530 Subject: [PATCH 3/3] fix(captions): recover from poisoned mutex instead of panicking --- apps/desktop/src-tauri/src/captions.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/desktop/src-tauri/src/captions.rs b/apps/desktop/src-tauri/src/captions.rs index 955311791cd..722b6b11264 100644 --- a/apps/desktop/src-tauri/src/captions.rs +++ b/apps/desktop/src-tauri/src/captions.rs @@ -1119,7 +1119,7 @@ pub async fn transcribe_audio( log::info!("Using Parakeet TDT engine"); let model_dir = model_path.clone(); tokio::task::spawn_blocking(move || { - let _guard = TRANSCRIPTION_LOCK.lock().unwrap(); + let _guard = TRANSCRIPTION_LOCK.lock().unwrap_or_else(|p| p.into_inner()); process_with_parakeet(&audio_path, &model_dir) }) .await @@ -1145,7 +1145,7 @@ pub async fn 
transcribe_audio( log::info!("Starting Whisper transcription in blocking task..."); let result = tokio::task::spawn_blocking(move || { - let _guard = TRANSCRIPTION_LOCK.lock().unwrap(); + let _guard = TRANSCRIPTION_LOCK.lock().unwrap_or_else(|p| p.into_inner()); process_with_whisper(&audio_path, context, &language, &transcription_hints) }) .await