2026-02-24 13:22:10 -06:00

38 lines
1.0 KiB
Python

import re
from functools import lru_cache
from nltk.corpus import cmudict
class PhoneticProcessor:
    """Look up CMU Pronouncing Dictionary phonemes for loosely formatted words.

    Attributes:
        dict: Mapping from normalized word to its pronunciations, as returned
            by ``cmudict.dict()``. Empty when the corpus could not be loaded.
    """

    def __init__(self):
        self.dict = self._load_cmudict()

    @staticmethod
    def _load_cmudict():
        """Return the CMU dictionary mapping, or ``{}`` if it cannot be loaded.

        A ``LookupError`` from NLTK is treated as "corpus unavailable": the
        processor stays usable but every lookup comes back empty, so the
        condition is logged instead of being swallowed silently.
        """
        try:
            return cmudict.dict()
        except LookupError:
            # Local import keeps this block self-contained.
            import logging

            logging.getLogger(__name__).warning(
                "CMU Pronouncing Dictionary could not be loaded; "
                "phoneme lookups will return no results."
            )
            return {}

    @staticmethod
    @lru_cache(maxsize=8192)
    def _normalize_cached(word: str) -> str:
        """Memoized normalization; static so the cache never references an instance."""
        word = word.lower().strip()
        # Drop everything except lowercase ASCII letters and apostrophes
        # (digits and punctuation are removed too).
        word = re.sub(r"[^a-z']", "", word)
        # Handle common rap contractions: "runnin'" -> "running".
        if word.endswith("in'"):
            word = word[:-1] + "g"
        return word

    def normalize_word(self, word: str) -> str:
        """Standardize ``word`` for dictionary lookup.

        Lowercases and strips the word, removes every character that is not
        ``a``-``z`` or an apostrophe, and expands a trailing ``in'`` to ``ing``.

        Args:
            word: Raw token, possibly with punctuation or mixed case.

        Returns:
            The normalized form (may be an empty string).
        """
        return self._normalize_cached(word)

    def _lookup_phonemes(self, normalized: str):
        """Return the pronunciations of an already-normalized word as nested tuples."""
        return tuple(tuple(phones) for phones in self.dict.get(normalized, []))

    def get_phonemes(self, word: str):
        """Return all known pronunciations of ``word``.

        Args:
            word: Raw token; it is normalized with ``normalize_word`` first.

        Returns:
            A tuple of pronunciations, each a tuple of phoneme strings.
            An empty tuple when the normalized word is not in the dictionary.
        """
        try:
            cached_lookup = self._phoneme_cache
        except AttributeError:
            # Built lazily and stored on the instance: a class-level lru_cache
            # on a method would key on ``self`` and keep every instance alive.
            # The instance <-> cache reference cycle is reclaimed by the
            # garbage collector.
            cached_lookup = lru_cache(maxsize=8192)(self._lookup_phonemes)
            self._phoneme_cache = cached_lookup
        # Keyed on the normalized form so spelling variants share one entry.
        return cached_lookup(self.normalize_word(word))
# Module-level shared instance, created when this module is imported.
# Construction runs PhoneticProcessor.__init__, which loads the dictionary
# via _load_cmudict(), so the load happens at import time.
processor = PhoneticProcessor()