154 lines
5.5 KiB
Python
154 lines
5.5 KiB
Python
import speech_recognition as sr
|
|
from typing import Protocol
|
|
import queue
|
|
import sys
|
|
from .config import (
|
|
MICROPHONE_DEVICE_INDEX,
|
|
FASTER_WHISPER_DEVICE,
|
|
FASTER_WHISPER_COMPUTE_TYPE,
|
|
)
|
|
|
|
# Windows fallback: building PyAudio from source can fail on newer Python
# versions due to missing PortAudio headers, whereas pyaudiowpatch ships
# prebuilt wheels. When plain PyAudio cannot be imported, register
# pyaudiowpatch under the "pyaudio" module name instead.
try:
    import pyaudio  # type: ignore # noqa: F401
except Exception:
    try:
        import pyaudiowpatch as _pyaudio_patch  # type: ignore
    except Exception:
        # Neither backend is importable; leave sys.modules untouched.
        pass
    else:
        # setdefault never overrides a "pyaudio" entry that is already present.
        sys.modules.setdefault("pyaudio", _pyaudio_patch)
|
|
|
|
|
|
class Stoppable(Protocol):
|
|
"""Protocol for a callable that can be stopped, like the background listener."""
|
|
|
|
def __call__(self, wait_for_stop: bool = True) -> None: ...
|
|
|
|
|
|
# This global variable will hold the background listening process handle
|
|
background_listener_stop: Stoppable | None = None
|
|
|
|
|
|
def _recognize_local_whisper(
    recognizer: sr.Recognizer,
    audio: sr.AudioData,
    engine: str,
    whisper_model: str,
    faster_whisper_device: str | None = None,
    faster_whisper_compute_type: str | None = None,
) -> str:
    """Transcribe ``audio`` with a local Whisper-family backend.

    ``recognize_faster_whisper`` is used only when ``engine`` is
    ``"faster-whisper"`` and the recognizer exposes that method; in every
    other case the call goes to ``recognize_whisper``.

    The device and compute-type arguments are stripped and lower-cased.
    When one is missing or blank, the matching ``FASTER_WHISPER_*`` config
    constant is used as-is. A device of ``"auto"`` and a compute type of
    ``"auto"`` or ``"default"`` are not forwarded to the backend.

    Returns whatever the selected recognizer method returns.
    """
    use_faster = engine == "faster-whisper" and hasattr(
        recognizer, "recognize_faster_whisper"
    )
    if not use_faster:
        # Also covers SpeechRecognition builds without the faster-whisper wrapper.
        return recognizer.recognize_whisper(audio, model=whisper_model)

    device = (faster_whisper_device or "").strip().lower()
    if not device:
        device = FASTER_WHISPER_DEVICE

    compute_type = (faster_whisper_compute_type or "").strip().lower()
    if not compute_type:
        compute_type = FASTER_WHISPER_COMPUTE_TYPE

    overrides: dict[str, str] = {}
    if device != "auto":
        overrides["device"] = device
    if compute_type not in {"auto", "default"}:
        overrides["compute_type"] = compute_type

    return recognizer.recognize_faster_whisper(
        audio,
        model=whisper_model,
        init_options=overrides if overrides else None,
    )
|
|
|
|
|
|
def start_background_listening(
|
|
message_queue: queue.Queue[tuple[str, str]],
|
|
engine: str,
|
|
whisper_model: str,
|
|
faster_whisper_device: str | None = None,
|
|
faster_whisper_compute_type: str | None = None,
|
|
):
|
|
"""
|
|
Starts listening to the microphone in a background thread.
|
|
|
|
Puts status and result messages into the provided queue.
|
|
"""
|
|
global background_listener_stop
|
|
if background_listener_stop:
|
|
message_queue.put(("status", "Already listening."))
|
|
return
|
|
|
|
try:
|
|
recognizer = sr.Recognizer()
|
|
microphone = sr.Microphone(device_index=MICROPHONE_DEVICE_INDEX)
|
|
except (OSError, AttributeError) as e:
|
|
message_queue.put(("status", f"Error: No microphone found. ({e})"))
|
|
return
|
|
|
|
with microphone as source:
|
|
message_queue.put(("status", "Adjusting for ambient noise..."))
|
|
recognizer.adjust_for_ambient_noise(source)
|
|
message_queue.put(("status", "Listening..."))
|
|
|
|
def recognition_callback(recognizer: sr.Recognizer, audio: sr.AudioData) -> None:
|
|
message_queue.put(("status", "Processing..."))
|
|
try:
|
|
if engine == "google":
|
|
text = recognizer.recognize_google(audio)
|
|
elif engine in {"whisper", "faster-whisper"}:
|
|
# Use local Whisper-family inference for privacy.
|
|
# Models are downloaded automatically on first use.
|
|
if engine == "faster-whisper" and not hasattr(
|
|
recognizer, "recognize_faster_whisper"
|
|
):
|
|
message_queue.put(
|
|
(
|
|
"status",
|
|
"faster-whisper not exposed by this SpeechRecognition build; falling back to whisper.",
|
|
)
|
|
)
|
|
text = _recognize_local_whisper(
|
|
recognizer,
|
|
audio,
|
|
engine,
|
|
whisper_model,
|
|
faster_whisper_device=faster_whisper_device,
|
|
faster_whisper_compute_type=faster_whisper_compute_type,
|
|
)
|
|
else: # Default to the fast, offline, but less accurate sphinx
|
|
text = recognizer.recognize_sphinx(audio)
|
|
|
|
message_queue.put(
|
|
("result", f"{text} ")
|
|
) # Add space for continuous dictation
|
|
message_queue.put(("status", "Listening...")) # Ready for the next phrase
|
|
except sr.UnknownValueError:
|
|
message_queue.put(
|
|
("status", "Could not understand audio. Still listening...")
|
|
)
|
|
except sr.RequestError as e:
|
|
error_msg = f"API error: {e}"
|
|
if engine == "google":
|
|
error_msg += ". Check internet connection."
|
|
message_queue.put(("status", error_msg))
|
|
except Exception as e:
|
|
message_queue.put(
|
|
("status", f"An unexpected error occurred in speech recognition: {e}")
|
|
)
|
|
|
|
# `listen_in_background` returns a function to stop the background listener
|
|
stop_listening = recognizer.listen_in_background(microphone, recognition_callback)
|
|
background_listener_stop = stop_listening
|
|
|
|
|
|
def stop_background_listening():
    """Stop the background listener, if any, and clear the stored stop callable."""
    global background_listener_stop
    stopper = background_listener_stop
    if not stopper:
        # Nothing is running; there is nothing to stop.
        return
    # Ask the listener to stop without blocking on its shutdown.
    stopper(wait_for_stop=False)
    background_listener_stop = None
|