using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;

namespace LyricFlow.Core.Engine;

/// <summary>
/// Looks up phoneme sequences for words using a pronunciation dictionary
/// loaded from a text file, after normalizing the words to a canonical form.
/// </summary>
public class PhoneticProcessor
{
    // Normalized (lower-case) word -> every pronunciation variation found for it,
    // each variation being an ordered list of phoneme tokens.
    private readonly Dictionary<string, List<List<string>>> _dictionary = new();

    // Matches every character that is neither an ASCII letter nor an apostrophe.
    private static readonly Regex WordCleanupRegex =
        new(@"[^a-z']", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Creates a processor and, when a path is supplied, loads the dictionary from it.
    /// </summary>
    /// <param name="cmudictPath">
    /// Path to the dictionary file. When null, empty or whitespace, nothing is loaded
    /// and every lookup returns an empty result.
    /// </param>
    public PhoneticProcessor(string? cmudictPath)
    {
        if (!string.IsNullOrWhiteSpace(cmudictPath))
        {
            LoadCmuDict(cmudictPath);
        }
    }

    /// <summary>
    /// Reads the dictionary file line by line and fills <see cref="_dictionary"/>.
    /// A missing file is silently ignored, leaving the dictionary as it was.
    /// </summary>
    /// <param name="path">Path of the dictionary file to read.</param>
    private void LoadCmuDict(string path)
    {
        if (!File.Exists(path))
        {
            return;
        }

        foreach (var line in File.ReadLines(path))
        {
            // Skip blank lines and ";;;" comment lines.
            if (string.IsNullOrWhiteSpace(line) || line.StartsWith(";;;", StringComparison.Ordinal))
            {
                continue;
            }

            var parts = line.Split(' ', StringSplitOptions.RemoveEmptyEntries);

            // A usable entry needs a word, a variation id and at least one phoneme.
            if (parts.Length < 3)
            {
                continue;
            }

            // Invariant casing keeps keys stable regardless of the current culture
            // (e.g. Turkish would otherwise lower-case "I" to a dotless "ı").
            var word = parts[0].ToLowerInvariant();

            // The format from NLTK has [WORD] [VARIATION_ID] [PH1] [PH2]...
            var phonemes = parts.Skip(2).ToList();

            if (!_dictionary.TryGetValue(word, out var variations))
            {
                variations = new List<List<string>>();
                _dictionary[word] = variations;
            }

            variations.Add(phonemes);
        }
    }

    /// <summary>
    /// Converts a word to its lookup form: lower-cased, trimmed, stripped of every
    /// character other than ASCII letters and apostrophes, and with a trailing
    /// "in'" rewritten to "ing" (for example "runnin'" becomes "running").
    /// </summary>
    /// <param name="word">The raw word to normalize.</param>
    /// <returns>The normalized word; may be empty when nothing survives the cleanup.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
    public string NormalizeWord(string word)
    {
        if (word is null)
        {
            throw new ArgumentNullException(nameof(word));
        }

        // Invariant casing so the ASCII-only cleanup below behaves the same in every culture.
        word = word.ToLowerInvariant().Trim();
        word = WordCleanupRegex.Replace(word, "");

        // Replace the apostrophe of a dropped "g" with the letter itself.
        if (word.EndsWith("in'", StringComparison.Ordinal))
        {
            word = word[..^1] + "g";
        }

        return word;
    }

    /// <summary>
    /// Returns the pronunciation variations stored for a word.
    /// </summary>
    /// <param name="word">The word to look up; it is normalized with <see cref="NormalizeWord"/> first.</param>
    /// <returns>
    /// The list of variations held in the dictionary for the normalized word (the stored
    /// instance itself, not a copy), or a new empty list when the word is unknown.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
    public List<List<string>> GetPhonemes(string word)
    {
        var normalized = NormalizeWord(word);

        if (_dictionary.TryGetValue(normalized, out var phones))
        {
            return phones;
        }

        return new List<List<string>>();
    }

    /// <summary>
    /// The underlying word-to-variations map. This is the live instance used by the processor.
    /// </summary>
    public Dictionary<string, List<List<string>>> Dictionary => _dictionary;
}