38 lines
1.0 KiB
Python
38 lines
1.0 KiB
Python
import re
|
|
from functools import lru_cache
|
|
from nltk.corpus import cmudict
|
|
|
|
class PhoneticProcessor:
    """Look up phoneme sequences for words via NLTK's ``cmudict`` corpus.

    Words are normalized (lowercased, stripped of everything except ASCII
    letters and apostrophes, with a trailing ``in'`` expanded to ``ing``)
    before being looked up in the pronunciation dictionary held in
    ``self.dict``.

    Caching is deliberately *not* done with ``lru_cache`` on the instance
    methods: that would key the cache on ``self`` and keep every instance
    (and its dictionary) alive for the lifetime of the class. Instead,
    normalization, which does not depend on instance state, is cached in a
    static helper, and phoneme lookups are memoized per instance.
    """

    # Upper bound on memoized entries (normalization cache and each
    # instance's phoneme cache).
    _CACHE_SIZE = 8192

    def __init__(self):
        """Load the pronunciation dictionary and set up the lookup cache."""
        self.dict = self._load_cmudict()
        # Maps normalized word -> tuple of pronunciations (see get_phonemes).
        self._phoneme_cache = {}

    @staticmethod
    def _load_cmudict():
        """Return the mapping from ``cmudict.dict()``.

        Returns:
            The mapping produced by ``cmudict.dict()``, or an empty dict
            when NLTK raises ``LookupError`` (corpus data not available).
            The fallback is best-effort: lookups then simply find nothing.
        """
        try:
            return cmudict.dict()
        except LookupError:
            # Keep the best-effort fallback, but do not fail silently:
            # with an empty dictionary every lookup returns ().
            import logging

            logging.getLogger(__name__).warning(
                "CMU Pronouncing Dictionary corpus not found; phoneme "
                "lookups will return no results. Install it with "
                "nltk.download('cmudict')."
            )
            return {}

    @staticmethod
    @lru_cache(maxsize=_CACHE_SIZE)
    def _normalize(word):
        """Cached, instance-independent implementation of normalize_word."""
        cleaned = word.lower().strip()
        # Drop everything except lowercase ASCII letters and apostrophes
        # (digits, punctuation and inner whitespace are removed too).
        cleaned = re.sub(r"[^a-z']", "", cleaned)

        # Expand the dropped-g contraction: "runnin'" -> "running".
        if cleaned.endswith("in'"):
            cleaned = cleaned[:-1] + "g"

        return cleaned

    def normalize_word(self, word: str) -> str:
        """Standardize *word* for dictionary lookup.

        Args:
            word: Raw token, in any case, possibly with punctuation.

        Returns:
            The lowercased word containing only ``a-z`` and apostrophes,
            with a trailing ``in'`` rewritten to ``ing``. May be empty.
        """
        return self._normalize(word)

    def get_phonemes(self, word: str) -> tuple:
        """Return all pronunciations stored for *word*.

        Args:
            word: Raw token; it is normalized with ``normalize_word`` first.

        Returns:
            A tuple of pronunciations, each itself a tuple of the phoneme
            entries stored in ``self.dict``. Empty tuple when the normalized
            word is not in the dictionary.
        """
        normalized = self.normalize_word(word)

        try:
            cache = self._phoneme_cache
        except AttributeError:
            # Instance was created without running __init__ (e.g. a
            # subclass that sets ``dict`` itself); create the cache lazily.
            cache = self._phoneme_cache = {}

        try:
            return cache[normalized]
        except KeyError:
            pass

        phonemes = tuple(
            tuple(phones) for phones in self.dict.get(normalized, [])
        )

        # Keep memory bounded: once full, start over rather than tracking
        # recency. Rebuilding an entry is only a dict lookup plus a small
        # tuple conversion.
        if len(cache) >= self._CACHE_SIZE:
            cache.clear()
        cache[normalized] = phonemes
        return phonemes
|
|
|
|
# Shared module-level instance, created at import time. The class itself does
# not enforce a singleton; this is simply the instance other code is expected
# to import. Constructing it calls PhoneticProcessor.__init__, which loads the
# pronunciation dictionary.
|
|
processor = PhoneticProcessor()
|