"""Word normalization and phoneme lookup backed by NLTK's ``cmudict`` corpus."""

import logging
import re
from functools import lru_cache

from nltk.corpus import cmudict

logger = logging.getLogger(__name__)

# Upper bound on the number of entries kept by each lookup cache.
_CACHE_SIZE = 8192

# Matches every character that is NOT a lowercase ASCII letter or an
# apostrophe. Digits, punctuation, whitespace and non-ASCII letters all match
# and are therefore removed during normalization.
_NON_WORD_CHARS = re.compile(r"[^a-z']")


@lru_cache(maxsize=_CACHE_SIZE)
def _normalize_word(word: str) -> str:
    """Return *word* in the form used as a dictionary key.

    Kept at module level because the result depends only on *word*; caching
    it here avoids keying the cache on (and thereby pinning) a
    ``PhoneticProcessor`` instance.
    """
    cleaned = _NON_WORD_CHARS.sub("", word.lower().strip())
    # Handle common rap contractions: a trailing "in'" has its apostrophe
    # replaced by "g", e.g. "runnin'" -> "running".
    if cleaned.endswith("in'"):
        cleaned = cleaned[:-1] + "g"
    return cleaned


class PhoneticProcessor:
    """Normalize words and look up their pronunciations.

    Attributes:
        dict: Mapping returned by ``cmudict.dict()``, or an empty dict when
            the corpus could not be loaded. ``get_phonemes`` reads it with
            normalized words as keys.
    """

    def __init__(self):
        self.dict = self._load_cmudict()
        # Bounded cache owned by this instance. Decorating the method itself
        # with ``lru_cache`` would store ``self`` in a class-level cache,
        # keeping every instance alive and making all instances compete for
        # the same entries.
        self._cached_phonemes = lru_cache(maxsize=_CACHE_SIZE)(
            self._lookup_phonemes
        )

    @staticmethod
    def _load_cmudict():
        """Load the pronunciation dictionary.

        Returns:
            The result of ``cmudict.dict()``, or ``{}`` if loading raises
            ``LookupError``. The fallback is deliberate so that the module
            stays importable; a warning is logged so the degraded mode is
            visible.
        """
        try:
            return cmudict.dict()
        except LookupError:
            logger.warning(
                "cmudict corpus could not be loaded; phoneme lookups will "
                "return no results"
            )
            return {}

    def normalize_word(self, word: str) -> str:
        """Standardize *word* for dictionary lookup.

        The word is lowercased, stripped of surrounding whitespace, and
        reduced to the characters ``a``-``z`` and ``'`` (digits and non-ASCII
        letters are removed too). A trailing ``in'`` is then expanded to
        ``ing``.

        Args:
            word: Raw token. Must be hashable, since results are cached.

        Returns:
            The normalized word; may be an empty string.
        """
        return _normalize_word(word)

    def _lookup_phonemes(self, word: str) -> "tuple[tuple[str, ...], ...]":
        """Uncached lookup behind ``get_phonemes``."""
        normalized = self.normalize_word(word)
        return tuple(tuple(phones) for phones in self.dict.get(normalized, ()))

    def get_phonemes(self, word: str) -> "tuple[tuple[str, ...], ...]":
        """Return the known pronunciations of *word*.

        Args:
            word: Raw token; it is normalized with ``normalize_word`` before
                the lookup. Must be hashable, since results are cached per
                instance.

        Returns:
            A tuple with one inner tuple per entry stored in ``self.dict``
            for the normalized word, or an empty tuple when the word is not
            present. Tuples are used so the cached value cannot be mutated
            by callers.
        """
        return self._cached_phonemes(word)


# Singleton instance
processor = PhoneticProcessor()