using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using LyricFlow.Core.Dtos;
using LyricFlow.Core.Services;

namespace LyricFlow.Core.Engine;

/// <summary>
/// Flags words that are in neither the phonetic processor's dictionary nor the
/// WordNet lexicon, and proposes corrections drawn from the phonetic dictionary
/// using edit-distance and sequence-similarity ranking.
/// </summary>
public class SpellcheckEngine
{
    /// <summary>Default similarity cutoff applied by <see cref="GetSpellingSuggestions"/>.</summary>
    private const double SuggestionSimilarityCutoff = 0.75;

    /// <summary>Candidates whose length differs from the input by more than this are skipped when possible.</summary>
    private const int MaxCandidateLengthDelta = 3;

    /// <summary>Number of suggestions <see cref="GetAutocorrectCandidate"/> chooses between.</summary>
    private const int AutocorrectSuggestionPool = 3;

    // Hoisted so the patterns are parsed once instead of once per analysed line.
    private static readonly Regex BracketTagRegex = new(@"\[.*?\]", RegexOptions.Compiled);
    private static readonly Regex WordRegex = new(@"\b\w+\b", RegexOptions.Compiled);

    private readonly PhoneticProcessor _processor;
    private readonly WordNetLexicon _wordNet;

    // Lazily built index of dictionary words keyed by their first character.
    private Dictionary<char, List<string>>? _cmuByInitial;

    // MARK: - Lifecycle
    #region Lifecycle

    /// <summary>Creates an engine backed by the given phonetic processor and WordNet lexicon.</summary>
    /// <param name="processor">Supplies word normalization and the dictionary used for lookups and candidates.</param>
    /// <param name="wordNet">Secondary lexicon consulted for known-word checks and autocorrect ranking.</param>
    public SpellcheckEngine(PhoneticProcessor processor, WordNetLexicon wordNet)
    {
        _processor = processor;
        _wordNet = wordNet;
    }

    #endregion

    // MARK: - Dictionary Index
    #region Dictionary Index

    /// <summary>
    /// Dictionary words grouped by first character. Built on first access from
    /// <c>_processor.Dictionary.Keys</c> and cached afterwards; empty keys are skipped.
    /// </summary>
    private Dictionary<char, List<string>> CmuWordsByInitial
    {
        get
        {
            if (_cmuByInitial is not null)
            {
                return _cmuByInitial;
            }

            // Build into a local and publish only when complete, so the cached
            // field never refers to a half-populated index.
            var index = new Dictionary<char, List<string>>();
            foreach (var word in _processor.Dictionary.Keys)
            {
                if (string.IsNullOrEmpty(word))
                {
                    continue;
                }

                char initial = word[0];
                if (!index.TryGetValue(initial, out var bucket))
                {
                    bucket = new List<string>();
                    index[initial] = bucket;
                }

                bucket.Add(word);
            }

            _cmuByInitial = index;
            return index;
        }
    }

    #endregion

    // MARK: - Suggestion Queries
    #region Suggestion Queries

    /// <summary>
    /// Returns <c>true</c> when the normalized form of <paramref name="word"/> is empty,
    /// or is present in the processor dictionary or the WordNet lexicon.
    /// </summary>
    /// <param name="word">Word to check; it is normalized before lookup.</param>
    public bool IsKnownWord(string word)
    {
        var normalized = _processor.NormalizeWord(word);
        if (string.IsNullOrEmpty(normalized))
        {
            // Nothing left to check after normalization, so there is nothing to flag.
            return true;
        }

        return _processor.Dictionary.ContainsKey(normalized) || _wordNet.ContainsWord(normalized);
    }

    /// <summary>
    /// Returns up to <paramref name="limit"/> dictionary words that closely resemble
    /// <paramref name="word"/>, best match first.
    /// </summary>
    /// <param name="word">Word to find suggestions for; it is normalized first.</param>
    /// <param name="limit">Maximum number of suggestions to return.</param>
    /// <returns>An empty list when the word normalizes to nothing or is already known.</returns>
    public List<string> GetSpellingSuggestions(string word, int limit = 6)
    {
        var normalized = _processor.NormalizeWord(word);
        if (string.IsNullOrEmpty(normalized) || IsKnownWord(normalized))
        {
            return new List<string>();
        }

        // Prefer words sharing the first character; fall back to the whole
        // dictionary when no word starts with that character.
        if (!CmuWordsByInitial.TryGetValue(normalized[0], out var candidates))
        {
            candidates = _processor.Dictionary.Keys.ToList();
        }

        var lengthFiltered = candidates
            .Where(w => Math.Abs(w.Length - normalized.Length) <= MaxCandidateLengthDelta)
            .ToList();
        if (lengthFiltered.Count == 0)
        {
            lengthFiltered = candidates;
        }

        return GetCloseMatches(normalized, lengthFiltered, limit, SuggestionSimilarityCutoff);
    }

    /// <summary>
    /// Scores every entry of <paramref name="possibilities"/> against <paramref name="word"/>,
    /// keeps those whose similarity ratio is at least <paramref name="cutoff"/>, and returns
    /// the best <paramref name="n"/>.
    /// </summary>
    /// <remarks>
    /// Ordering: heuristic rank, edit distance, LCS similarity (descending), length difference,
    /// shared prefix (descending), shared suffix (descending), then ordinal string order.
    /// </remarks>
    private static List<string> GetCloseMatches(string word, List<string> possibilities, int n, double cutoff)
    {
        var scored = new List<(int HeuristicRank, int Distance, double Similarity, int LengthDelta, int SharedPrefix, int SharedSuffix, string Match)>();

        foreach (var candidate in possibilities)
        {
            // The distance drives both the cutoff ratio and the ranking, so compute it once.
            int distance = DamerauLevenshteinDistance(word, candidate);
            double ratio = SimilarityRatio(distance, word.Length + candidate.Length);
            if (ratio >= cutoff)
            {
                scored.Add((
                    HeuristicRank(word, candidate),
                    distance,
                    SequenceSimilarity(word, candidate),
                    Math.Abs(word.Length - candidate.Length),
                    SharedPrefixLength(word, candidate),
                    SharedSuffixLength(word, candidate),
                    candidate
                ));
            }
        }

        return scored
            .OrderBy(item => item.HeuristicRank)
            .ThenBy(item => item.Distance)
            .ThenByDescending(item => item.Similarity)
            .ThenBy(item => item.LengthDelta)
            .ThenByDescending(item => item.SharedPrefix)
            .ThenByDescending(item => item.SharedSuffix)
            .ThenBy(item => item.Match, StringComparer.Ordinal)
            .Take(n)
            .Select(item => item.Match)
            .ToList();
    }

    /// <summary>
    /// Converts an edit distance into a ratio: <c>(totalLength - distance) / totalLength</c>,
    /// or <c>1.0</c> when <paramref name="totalLength"/> is zero.
    /// </summary>
    /// <param name="distance">Edit distance between the two strings.</param>
    /// <param name="totalLength">Sum of the two strings' lengths.</param>
    private static double SimilarityRatio(int distance, int totalLength)
    {
        if (totalLength == 0)
        {
            return 1.0;
        }

        return (double)(totalLength - distance) / totalLength;
    }

    #endregion

    // MARK: - Similarity Helpers
    #region Similarity Helpers

    /// <summary>
    /// Computes the Damerau-Levenshtein distance between <paramref name="a"/> and
    /// <paramref name="b"/>: insertions, deletions, substitutions and transpositions each cost 1.
    /// </summary>
    /// <param name="a">First string; <c>null</c> is treated as empty.</param>
    /// <param name="b">Second string; <c>null</c> is treated as empty.</param>
    /// <returns>The edit distance; 0 when the strings are equal.</returns>
    public static int DamerauLevenshteinDistance(string a, string b)
    {
        if (a == b)
        {
            return 0;
        }

        if (string.IsNullOrEmpty(a))
        {
            return b?.Length ?? 0;
        }

        if (string.IsNullOrEmpty(b))
        {
            return a.Length;
        }

        // For each character: the last (1-based) row of `a` in which it was seen, 0 if not yet seen.
        var lastRowByChar = new Dictionary<char, int>();
        foreach (var ch in a.Concat(b))
        {
            lastRowByChar.TryAdd(ch, 0);
        }

        // The matrix carries an extra sentinel row and column (index 0) filled with
        // maxDistance, so a transposition lookup that lands there can never win the Min.
        int maxDistance = a.Length + b.Length;
        int[,] d = new int[a.Length + 2, b.Length + 2];
        d[0, 0] = maxDistance;

        for (int i = 0; i <= a.Length; i++)
        {
            d[i + 1, 0] = maxDistance;
            d[i + 1, 1] = i;
        }

        for (int j = 0; j <= b.Length; j++)
        {
            d[0, j + 1] = maxDistance;
            d[1, j + 1] = j;
        }

        for (int i = 1; i <= a.Length; i++)
        {
            // Last (1-based) column in this row where a[i - 1] matched a character of b.
            int lastMatchColumn = 0;

            for (int j = 1; j <= b.Length; j++)
            {
                int i1 = lastRowByChar[b[j - 1]];
                int j1 = lastMatchColumn;

                int cost = 1;
                if (a[i - 1] == b[j - 1])
                {
                    cost = 0;
                    lastMatchColumn = j;
                }

                int substitution = d[i, j] + cost;
                int insertion = d[i + 1, j] + 1;
                int deletion = d[i, j + 1] + 1;
                // Transpose the characters at (i1, j1) and count the characters skipped on either side.
                int transposition = d[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1);

                d[i + 1, j + 1] = Math.Min(
                    Math.Min(substitution, insertion),
                    Math.Min(deletion, transposition));
            }

            lastRowByChar[a[i - 1]] = i;
        }

        return d[a.Length + 1, b.Length + 1];
    }

    /// <summary>
    /// Computes the Levenshtein distance between <paramref name="a"/> and <paramref name="b"/>:
    /// insertions, deletions and substitutions each cost 1.
    /// </summary>
    /// <param name="a">First string; <c>null</c> is treated as empty.</param>
    /// <param name="b">Second string; <c>null</c> is treated as empty.</param>
    /// <returns>The edit distance; 0 when the strings are equal.</returns>
    public static int LevenshteinDistance(string a, string b)
    {
        if (a == b)
        {
            return 0;
        }

        if (string.IsNullOrEmpty(a))
        {
            return b?.Length ?? 0;
        }

        if (string.IsNullOrEmpty(b))
        {
            return a.Length;
        }

        // Two rolling rows are enough; every cell of `current` is overwritten on each pass.
        int[] previous = new int[b.Length + 1];
        int[] current = new int[b.Length + 1];
        for (int j = 0; j <= b.Length; j++)
        {
            previous[j] = j;
        }

        for (int i = 1; i <= a.Length; i++)
        {
            current[0] = i;
            for (int j = 1; j <= b.Length; j++)
            {
                int insertCost = current[j - 1] + 1;
                int deleteCost = previous[j] + 1;
                int replaceCost = previous[j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1);
                current[j] = Math.Min(Math.Min(insertCost, deleteCost), replaceCost);
            }

            (previous, current) = (current, previous);
        }

        return previous[b.Length];
    }

    /// <summary>Number of leading characters that <paramref name="a"/> and <paramref name="b"/> have in common.</summary>
    private static int SharedPrefixLength(string a, string b)
    {
        int limit = Math.Min(a.Length, b.Length);
        int count = 0;
        while (count < limit && a[count] == b[count])
        {
            count++;
        }

        return count;
    }

    /// <summary>Number of trailing characters that <paramref name="a"/> and <paramref name="b"/> have in common.</summary>
    private static int SharedSuffixLength(string a, string b)
    {
        int limit = Math.Min(a.Length, b.Length);
        int count = 0;
        while (count < limit && a[a.Length - 1 - count] == b[b.Length - 1 - count])
        {
            count++;
        }

        return count;
    }

    /// <summary>
    /// Similarity in [0, 1] computed as <c>2 * LCS / (a.Length + b.Length)</c>;
    /// returns <c>1.0</c> when both strings are empty.
    /// </summary>
    private static double SequenceSimilarity(string a, string b)
    {
        int total = a.Length + b.Length;
        if (total == 0)
        {
            return 1.0;
        }

        return (2.0 * LongestCommonSubsequenceLength(a, b)) / total;
    }

    /// <summary>Length of the longest common subsequence of <paramref name="a"/> and <paramref name="b"/>.</summary>
    private static int LongestCommonSubsequenceLength(string a, string b)
    {
        int[,] dp = new int[a.Length + 1, b.Length + 1];
        for (int i = 1; i <= a.Length; i++)
        {
            for (int j = 1; j <= b.Length; j++)
            {
                dp[i, j] = a[i - 1] == b[j - 1]
                    ? dp[i - 1, j - 1] + 1
                    : Math.Max(dp[i - 1, j], dp[i, j - 1]);
            }
        }

        return dp[a.Length, b.Length];
    }

    /// <summary>
    /// Primary sort key for suggestions: 0 when <paramref name="candidate"/> is
    /// <paramref name="source"/> with two adjacent letters swapped, 1 when it is
    /// <paramref name="source"/> with one letter doubled, 2 otherwise.
    /// </summary>
    private static int HeuristicRank(string source, string candidate)
    {
        if (IsAdjacentTransposition(source, candidate))
        {
            return 0;
        }

        return IsRepeatedLetterExpansion(source, candidate) ? 1 : 2;
    }

    /// <summary>
    /// Returns <c>true</c> when the two strings have equal length and differ only by
    /// one swap of two neighbouring characters.
    /// </summary>
    private static bool IsAdjacentTransposition(string source, string candidate)
    {
        if (source.Length != candidate.Length)
        {
            return false;
        }

        for (int i = 0; i < source.Length - 1; i++)
        {
            if (source[i] == candidate[i])
            {
                continue;
            }

            // i is the first mismatch, so everything before it is already known to be equal.
            // The pair must be swapped and the remaining tails must be identical.
            return source[i] == candidate[i + 1]
                && source[i + 1] == candidate[i]
                && source.AsSpan(i + 2).SequenceEqual(candidate.AsSpan(i + 2));
        }

        return false;
    }

    /// <summary>
    /// Returns <c>true</c> when <paramref name="candidate"/> is exactly one character longer
    /// than <paramref name="source"/> and removing one character of a doubled pair in
    /// <paramref name="candidate"/> yields <paramref name="source"/>.
    /// </summary>
    private static bool IsRepeatedLetterExpansion(string source, string candidate)
    {
        if (candidate.Length != source.Length + 1)
        {
            return false;
        }

        for (int i = 0; i < candidate.Length - 1; i++)
        {
            if (candidate[i] == candidate[i + 1] && candidate.Remove(i, 1) == source)
            {
                return true;
            }
        }

        return false;
    }

    #endregion

    // MARK: - Autocorrect
    #region Autocorrect

    /// <summary>
    /// Picks a single replacement for <paramref name="word"/>, or <c>null</c> when no
    /// suggestion is close enough.
    /// </summary>
    /// <param name="word">Word to correct; it is normalized first.</param>
    /// <param name="minRatio">Minimum similarity ratio a candidate must reach.</param>
    /// <param name="maxEditDistance">Maximum Damerau-Levenshtein distance a candidate may have.</param>
    /// <returns>
    /// <c>null</c> when the normalized word is empty, shorter than three characters, already
    /// known, or has no qualifying suggestion. Otherwise the preferred candidate: an "ing"
    /// candidate when the input ends in "ign", else the best same-length candidate, else the
    /// top-ranked candidate.
    /// </returns>
    public string? GetAutocorrectCandidate(string word, double minRatio = 0.75, int maxEditDistance = 2)
    {
        var normalized = _processor.NormalizeWord(word);
        if (string.IsNullOrEmpty(normalized) || normalized.Length < 3 || IsKnownWord(normalized))
        {
            return null;
        }

        var suggestions = GetSpellingSuggestions(normalized, AutocorrectSuggestionPool);
        if (suggestions.Count == 0)
        {
            return null;
        }

        var scored = new List<(double Ratio, int LexicalRank, int ApostrophePenalty, int LengthDelta, int Distance, string Word)>();
        foreach (var candidate in suggestions)
        {
            // One distance computation feeds both the ratio and the distance threshold.
            int distance = DamerauLevenshteinDistance(normalized, candidate);
            double ratio = SimilarityRatio(distance, normalized.Length + candidate.Length);
            if (ratio < minRatio || distance > maxEditDistance)
            {
                continue;
            }

            scored.Add((
                ratio,
                _wordNet.ContainsWord(candidate) ? 1 : 0,
                candidate.Contains('\'') ? 1 : 0,
                Math.Abs(candidate.Length - normalized.Length),
                distance,
                candidate
            ));
        }

        if (scored.Count == 0)
        {
            return null;
        }

        // Words also present in WordNet first, then no apostrophe, closest length,
        // smallest distance, highest ratio. OrderBy is stable, so ties keep suggestion order.
        var ranked = scored
            .OrderByDescending(item => item.LexicalRank)
            .ThenBy(item => item.ApostrophePenalty)
            .ThenBy(item => item.LengthDelta)
            .ThenBy(item => item.Distance)
            .ThenByDescending(item => item.Ratio)
            .ToList();

        // `ranked` is already in priority order, so the first element matching a filter
        // is the same one a re-sort of the filtered subset by these keys would pick.
        if (normalized.EndsWith("ign", StringComparison.Ordinal))
        {
            // A trailing "ign" is treated as a likely mistyped "ing".
            var ingWord = ranked
                .Select(item => item.Word)
                .FirstOrDefault(w => w.EndsWith("ing", StringComparison.Ordinal));
            if (!string.IsNullOrWhiteSpace(ingWord))
            {
                return ingWord;
            }
        }

        var exactLengthWord = ranked
            .Where(item => item.LengthDelta == 0)
            .Select(item => item.Word)
            .FirstOrDefault();
        if (!string.IsNullOrWhiteSpace(exactLengthWord))
        {
            return exactLengthWord;
        }

        return ranked[0].Word;
    }

    #endregion

    // MARK: - Text Analysis
    #region Text Analysis

    /// <summary>
    /// Scans <paramref name="text"/> line by line and reports every word that is not known.
    /// </summary>
    /// <param name="text">Text to analyse; lines are separated by <c>'\n'</c>. Null or empty yields no issues.</param>
    /// <param name="suggestionLimit">Maximum number of suggestions attached to each issue.</param>
    /// <returns>
    /// One <see cref="SpellingIssueDto"/> per unknown word, constructed from the raw word, its
    /// normalized form, the zero-based line index and the suggestions for it.
    /// </returns>
    public List<SpellingIssueDto> GetTextSpellingIssues(string text, int suggestionLimit = 6)
    {
        var issues = new List<SpellingIssueDto>();
        if (string.IsNullOrEmpty(text))
        {
            return issues;
        }

        var lines = text.Split('\n');
        for (int i = 0; i < lines.Length; i++)
        {
            var line = lines[i].Trim();

            // Lines starting with '#', '@' or '>' are skipped entirely.
            if (line.StartsWith('#') || line.StartsWith('@') || line.StartsWith('>'))
            {
                continue;
            }

            // Remove tags [tag]
            var analysisText = BracketTagRegex.Replace(line, "");

            // NOTE(review): \w does not match an apostrophe, so a contraction is analysed
            // as separate tokens - confirm this is intended.
            foreach (Match match in WordRegex.Matches(analysisText))
            {
                var rawWord = match.Value;
                var normalized = _processor.NormalizeWord(rawWord);
                if (string.IsNullOrEmpty(normalized) || IsKnownWord(normalized))
                {
                    continue;
                }

                issues.Add(new SpellingIssueDto(
                    rawWord,
                    normalized,
                    i,
                    GetSpellingSuggestions(normalized, suggestionLimit)
                ));
            }
        }

        return issues;
    }

    #endregion
}