- introduce `LyricFlow.Core.Backend` with shared DTOs, rhyme/spellcheck engines, and REST endpoints - wire Python GUI/core to run and call the backend via new bridge/client modules - add backend parity/integration tests and update packaging/ignore settings
456 lines
13 KiB
C#
456 lines
13 KiB
C#
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Text.RegularExpressions;
|
|
using LyricFlow.Core.Dtos;
|
|
using LyricFlow.Core.Services;
|
|
|
|
namespace LyricFlow.Core.Engine;
|
|
|
|
public class SpellcheckEngine
|
|
{
|
|
// Supplies word normalization (NormalizeWord) and the word dictionary (Dictionary) used for known-word checks and candidates.
private readonly PhoneticProcessor _processor;
|
|
// Second lexicon consulted through ContainsWord when deciding whether a word is known.
private readonly WordNetLexicon _wordNet;
|
|
// Cache for CmuWordsByInitial: dictionary keys grouped by first character; null until first access.
private Dictionary<char, List<string>>? _cmuByInitial;
|
|
|
|
// MARK: - Lifecycle
|
|
#region Lifecycle
|
|
|
|
/// <summary>
/// Creates a spellcheck engine that works against the given processor and lexicon.
/// </summary>
/// <param name="processor">Provides word normalization and the dictionary of known words.</param>
/// <param name="wordNet">Additional lexicon consulted when checking whether a word is known.</param>
public SpellcheckEngine(PhoneticProcessor processor, WordNetLexicon wordNet)
    => (_processor, _wordNet) = (processor, wordNet);
|
|
|
|
#endregion
|
|
|
|
// MARK: - Dictionary Index
|
|
#region Dictionary Index
|
|
|
|
/// <summary>
/// Index of the processor's dictionary keys grouped by their first character.
/// </summary>
/// <remarks>
/// Built on first access and cached in <c>_cmuByInitial</c>. The index is assembled in a local
/// variable and only stored in the field once it is complete, so an exception thrown while
/// enumerating the dictionary cannot leave a half-filled index cached for later calls.
/// No locking is performed here.
/// </remarks>
private Dictionary<char, List<string>> CmuWordsByInitial
{
    get
    {
        if (_cmuByInitial is not null)
        {
            return _cmuByInitial;
        }

        var index = new Dictionary<char, List<string>>();
        foreach (var word in _processor.Dictionary.Keys)
        {
            // Empty keys have no initial character to group by.
            if (string.IsNullOrEmpty(word))
            {
                continue;
            }

            // One lookup per word instead of ContainsKey followed by the indexer.
            if (!index.TryGetValue(word[0], out var bucket))
            {
                bucket = new List<string>();
                index[word[0]] = bucket;
            }

            bucket.Add(word);
        }

        _cmuByInitial = index;
        return index;
    }
}
|
|
|
|
#endregion
|
|
|
|
// MARK: - Suggestion Queries
|
|
#region Suggestion Queries
|
|
|
|
/// <summary>
/// Reports whether <paramref name="word"/> is recognized after normalization.
/// </summary>
/// <param name="word">The word to check; it is normalized with the processor first.</param>
/// <returns>
/// <c>true</c> when the normalized word is empty, is a key of the processor's dictionary,
/// or is contained in the WordNet lexicon; otherwise <c>false</c>.
/// </returns>
public bool IsKnownWord(string word)
{
    var key = _processor.NormalizeWord(word);

    // A word that normalizes to nothing leaves nothing to check, so it counts as known.
    return string.IsNullOrEmpty(key)
        || _processor.Dictionary.ContainsKey(key)
        || _wordNet.ContainsWord(key);
}
|
|
|
|
/// <summary>
/// Returns spelling suggestions for an unrecognized word.
/// </summary>
/// <param name="word">The word to find suggestions for; it is normalized first.</param>
/// <param name="limit">Maximum number of suggestions to return.</param>
/// <returns>
/// An empty list when the normalized word is empty or already known; otherwise the closest
/// matches (similarity cutoff 0.75) ordered best first.
/// </returns>
public List<string> GetSpellingSuggestions(string word, int limit = 6)
{
    var target = _processor.NormalizeWord(word);
    if (string.IsNullOrEmpty(target))
    {
        return new List<string>();
    }

    if (IsKnownWord(target))
    {
        return new List<string>();
    }

    // Prefer words sharing the first letter; fall back to every dictionary key when
    // no word starts with that character.
    List<string> pool = CmuWordsByInitial.TryGetValue(target[0], out var sameInitial)
        ? sameInitial
        : _processor.Dictionary.Keys.ToList();

    // Keep only words whose length is within three characters of the target.
    var nearLength = new List<string>();
    foreach (var entry in pool)
    {
        if (Math.Abs(entry.Length - target.Length) <= 3)
        {
            nearLength.Add(entry);
        }
    }

    // If the length filter removed everything, search the unfiltered pool instead.
    return GetCloseMatches(target, nearLength.Count > 0 ? nearLength : pool, limit, 0.75);
}
|
|
|
|
/// <summary>
/// Selects the best matches for <paramref name="word"/> from <paramref name="possibilities"/>.
/// </summary>
/// <param name="word">The (normalized) word being corrected.</param>
/// <param name="possibilities">Candidate words to score.</param>
/// <param name="n">Maximum number of matches to return.</param>
/// <param name="cutoff">Minimum similarity ratio a candidate must reach to be considered.</param>
/// <returns>
/// Up to <paramref name="n"/> candidates ordered by: heuristic rank, edit distance,
/// sequence similarity (descending), length difference, shared prefix (descending),
/// shared suffix (descending), then ordinal string order as the final tie-break.
/// </returns>
private List<string> GetCloseMatches(string word, List<string> possibilities, int n, double cutoff)
{
    var scored = new List<(int HeuristicRank, int Distance, double Similarity, int LengthDelta, int SharedPrefix, int SharedSuffix, string Match)>();
    foreach (var candidate in possibilities)
    {
        // Compute the edit distance once and derive the ratio from it. This is the same
        // formula as CalculateSimilarityRatio, inlined so the distance table is not
        // built twice for every candidate.
        int distance = DamerauLevenshteinDistance(word, candidate);
        int combinedLength = word.Length + candidate.Length;
        double ratio = combinedLength == 0
            ? 1.0
            : (double)(combinedLength - distance) / combinedLength;

        if (ratio >= cutoff)
        {
            // The remaining metrics are only needed for candidates that pass the cutoff.
            scored.Add((
                HeuristicRank(word, candidate),
                distance,
                SequenceSimilarity(word, candidate),
                Math.Abs(word.Length - candidate.Length),
                SharedPrefixLength(word, candidate),
                SharedSuffixLength(word, candidate),
                candidate
            ));
        }
    }

    return scored
        .OrderBy(item => item.HeuristicRank)
        .ThenBy(item => item.Distance)
        .ThenByDescending(item => item.Similarity)
        .ThenBy(item => item.LengthDelta)
        .ThenByDescending(item => item.SharedPrefix)
        .ThenByDescending(item => item.SharedSuffix)
        .ThenBy(item => item.Match, StringComparer.Ordinal)
        .Take(n)
        .Select(item => item.Match)
        .ToList();
}
|
|
|
|
/// <summary>
/// Converts the Damerau-Levenshtein distance between two strings into a similarity ratio.
/// </summary>
/// <param name="a">First string.</param>
/// <param name="b">Second string.</param>
/// <returns>
/// <c>(len(a) + len(b) - distance) / (len(a) + len(b))</c>, or <c>1.0</c> when both strings are empty.
/// </returns>
private double CalculateSimilarityRatio(string a, string b)
{
    int edits = DamerauLevenshteinDistance(a, b);
    int combined = a.Length + b.Length;

    // Two empty strings are identical; this also avoids dividing by zero.
    return combined == 0 ? 1.0 : (combined - edits) / (double)combined;
}
|
|
|
|
#endregion
|
|
|
|
// MARK: - Similarity Helpers
|
|
#region Similarity Helpers
|
|
|
|
/// <summary>
/// Computes the Damerau-Levenshtein edit distance between two strings, counting insertions,
/// deletions, substitutions and transpositions of adjacent characters as one edit each.
/// </summary>
/// <param name="a">First string; <c>null</c> is treated as empty.</param>
/// <param name="b">Second string; <c>null</c> is treated as empty.</param>
/// <returns>The minimum number of edits that turn <paramref name="a"/> into <paramref name="b"/>.</returns>
public static int DamerauLevenshteinDistance(string a, string b)
{
    // Normalize null to empty up front so both argument orders behave the same
    // (previously ("", null) dereferenced the null argument).
    a ??= string.Empty;
    b ??= string.Empty;

    if (a == b) return 0;
    if (a.Length == 0) return b.Length;
    if (b.Length == 0) return a.Length;

    // lastRow[c] is the 1-based position in `a` where character c was last seen
    // (0 = not seen yet). Every character of either string gets an entry.
    var lastRow = new Dictionary<char, int>();
    foreach (var ch in a) lastRow[ch] = 0;
    foreach (var ch in b) lastRow[ch] = 0;

    // The table carries an extra leading row and column filled with maxDistance, which acts
    // as a sentinel so the transposition term can never win before a real match exists.
    int maxDistance = a.Length + b.Length;
    var d = new int[a.Length + 2, b.Length + 2];
    d[0, 0] = maxDistance;

    for (int i = 0; i <= a.Length; i++)
    {
        d[i + 1, 0] = maxDistance;
        d[i + 1, 1] = i;
    }

    for (int j = 0; j <= b.Length; j++)
    {
        d[0, j + 1] = maxDistance;
        d[1, j + 1] = j;
    }

    for (int i = 1; i <= a.Length; i++)
    {
        // Column of the last position in `b` that matched a[i - 1] within this row.
        int lastMatchColumn = 0;
        for (int j = 1; j <= b.Length; j++)
        {
            int i1 = lastRow[b[j - 1]];
            int j1 = lastMatchColumn;
            int cost = 1;
            if (a[i - 1] == b[j - 1])
            {
                cost = 0;
                lastMatchColumn = j;
            }

            int substitution = d[i, j] + cost;
            int insertion = d[i + 1, j] + 1;
            int deletion = d[i, j + 1] + 1;
            // Transposition: cost up to the earlier matching pair, plus the characters
            // skipped on each side, plus one for the swap itself.
            int transposition = d[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1);

            d[i + 1, j + 1] = Math.Min(
                Math.Min(substitution, insertion),
                Math.Min(deletion, transposition));
        }

        lastRow[a[i - 1]] = i;
    }

    return d[a.Length + 1, b.Length + 1];
}
|
|
|
|
/// <summary>
/// Computes the Levenshtein edit distance (insertions, deletions, substitutions) between two strings.
/// </summary>
/// <param name="a">First string; <c>null</c> is treated as empty.</param>
/// <param name="b">Second string; <c>null</c> is treated as empty.</param>
/// <returns>The minimum number of single-character edits that turn <paramref name="a"/> into <paramref name="b"/>.</returns>
public static int LevenshteinDistance(string a, string b)
{
    // Normalize null to empty up front so both argument orders behave the same
    // (previously ("", null) dereferenced the null argument).
    a ??= string.Empty;
    b ??= string.Empty;

    if (a == b) return 0;
    if (a.Length == 0) return b.Length;
    if (b.Length == 0) return a.Length;

    // Only the previous and current rows of the table are needed, so two buffers
    // are allocated once and swapped after each row.
    var previous = new int[b.Length + 1];
    var current = new int[b.Length + 1];
    for (int j = 0; j <= b.Length; j++)
    {
        previous[j] = j;
    }

    for (int i = 1; i <= a.Length; i++)
    {
        current[0] = i;
        for (int j = 1; j <= b.Length; j++)
        {
            int insertCost = current[j - 1] + 1;
            int deleteCost = previous[j] + 1;
            int replaceCost = previous[j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1);
            current[j] = Math.Min(Math.Min(insertCost, deleteCost), replaceCost);
        }

        (previous, current) = (current, previous);
    }

    // After the final swap the last computed row is in `previous`.
    return previous[b.Length];
}
|
|
|
|
/// <summary>
/// Counts how many leading characters two strings have in common.
/// </summary>
/// <param name="a">First string.</param>
/// <param name="b">Second string.</param>
/// <returns>The length of the longest common prefix.</returns>
private static int SharedPrefixLength(string a, string b)
{
    int shortest = Math.Min(a.Length, b.Length);
    for (int index = 0; index < shortest; index++)
    {
        if (a[index] != b[index])
        {
            return index;
        }
    }

    // One string is a prefix of the other (or they are equal).
    return shortest;
}
|
|
|
|
/// <summary>
/// Counts how many trailing characters two strings have in common.
/// </summary>
/// <param name="a">First string.</param>
/// <param name="b">Second string.</param>
/// <returns>The length of the longest common suffix.</returns>
private static int SharedSuffixLength(string a, string b)
{
    int matched = 0;
    for (; matched < a.Length && matched < b.Length; matched++)
    {
        // Compare the characters `matched + 1` positions from the end of each string.
        if (a[^(matched + 1)] != b[^(matched + 1)])
        {
            break;
        }
    }

    return matched;
}
|
|
|
|
/// <summary>
/// Scores two strings by the length of their longest common subsequence.
/// </summary>
/// <param name="a">First string.</param>
/// <param name="b">Second string.</param>
/// <returns>
/// <c>2 * LCS / (len(a) + len(b))</c>, or <c>1.0</c> when both strings are empty.
/// </returns>
private static double SequenceSimilarity(string a, string b)
{
    int common = LongestCommonSubsequenceLength(a, b);
    int combined = a.Length + b.Length;

    // Two empty strings are identical; this also avoids dividing by zero.
    return combined == 0 ? 1.0 : (2.0 * common) / combined;
}
|
|
|
|
/// <summary>
/// Computes the length of the longest common subsequence of two strings.
/// </summary>
/// <param name="a">First string.</param>
/// <param name="b">Second string.</param>
/// <returns>The number of characters in the longest (not necessarily contiguous) common subsequence.</returns>
private static int LongestCommonSubsequenceLength(string a, string b)
{
    // table[r, c] holds the LCS length of the first r characters of a and the first c of b;
    // row 0 and column 0 stay at their default of zero.
    var table = new int[a.Length + 1, b.Length + 1];

    for (int row = 1; row <= a.Length; row++)
    {
        char left = a[row - 1];
        for (int col = 1; col <= b.Length; col++)
        {
            table[row, col] = left == b[col - 1]
                ? table[row - 1, col - 1] + 1
                : Math.Max(table[row - 1, col], table[row, col - 1]);
        }
    }

    return table[a.Length, b.Length];
}
|
|
|
|
/// <summary>
/// Ranks a candidate by the kind of typo that would turn it into <paramref name="source"/>.
/// </summary>
/// <param name="source">The misspelled word.</param>
/// <param name="candidate">The suggested correction.</param>
/// <returns>
/// <c>0</c> for an adjacent-character transposition, <c>1</c> when the candidate only adds
/// one repeated letter, and <c>2</c> otherwise. Lower is better.
/// </returns>
private static int HeuristicRank(string source, string candidate) =>
    IsAdjacentTransposition(source, candidate) ? 0
    : IsRepeatedLetterExpansion(source, candidate) ? 1
    : 2;
|
|
|
|
/// <summary>
/// Determines whether <paramref name="candidate"/> equals <paramref name="source"/> with
/// exactly one pair of adjacent characters swapped.
/// </summary>
/// <param name="source">The original word.</param>
/// <param name="candidate">The word to compare against.</param>
/// <returns><c>true</c> when the two strings differ only by one adjacent swap; otherwise <c>false</c>.</returns>
private static bool IsAdjacentTransposition(string source, string candidate)
{
    if (source.Length != candidate.Length)
    {
        return false;
    }

    // Find the first differing position, leaving room for a following character to swap with.
    int first = 0;
    while (first < source.Length - 1 && source[first] == candidate[first])
    {
        first++;
    }

    // No difference before the last character means there is nothing that could be a swap.
    if (first >= source.Length - 1)
    {
        return false;
    }

    bool swapped = source[first] == candidate[first + 1]
        && source[first + 1] == candidate[first];
    if (!swapped)
    {
        return false;
    }

    // Everything after the swapped pair must match exactly.
    for (int k = first + 2; k < source.Length; k++)
    {
        if (source[k] != candidate[k])
        {
            return false;
        }
    }

    return true;
}
|
|
|
|
/// <summary>
/// Determines whether <paramref name="candidate"/> is <paramref name="source"/> with one
/// letter doubled (for example "leter" and "letter").
/// </summary>
/// <param name="source">The shorter, original word.</param>
/// <param name="candidate">The word that must be exactly one character longer.</param>
/// <returns>
/// <c>true</c> when removing one character of a doubled pair in the candidate yields the source.
/// </returns>
private static bool IsRepeatedLetterExpansion(string source, string candidate)
{
    if (candidate.Length != source.Length + 1)
    {
        return false;
    }

    int position = 0;
    while (position < candidate.Length - 1)
    {
        // Only positions that start a doubled letter are worth collapsing.
        bool doubled = candidate[position] == candidate[position + 1];
        if (doubled && candidate.Remove(position, 1) == source)
        {
            return true;
        }

        position++;
    }

    return false;
}
|
|
|
|
#endregion
|
|
|
|
// MARK: - Autocorrect
|
|
#region Autocorrect
|
|
|
|
/// <summary>
/// Picks a single replacement for a misspelled word, or <c>null</c> when none qualifies.
/// </summary>
/// <param name="word">The word to correct; it is normalized first.</param>
/// <param name="minRatio">Minimum similarity ratio a suggestion must reach.</param>
/// <param name="maxEditDistance">Maximum Damerau-Levenshtein distance a suggestion may have.</param>
/// <returns>
/// <c>null</c> when the normalized word is empty, shorter than three characters, already known,
/// or has no suggestion within the thresholds. Otherwise the chosen suggestion: for input
/// ending in "ign" a suggestion ending in "ing" wins; failing that, the best suggestion of the
/// same length as the input; failing that, the top-ranked suggestion.
/// </returns>
public string? GetAutocorrectCandidate(string word, double minRatio = 0.75, int maxEditDistance = 2)
{
    var normalized = _processor.NormalizeWord(word);
    if (string.IsNullOrEmpty(normalized) || normalized.Length < 3 || IsKnownWord(normalized))
    {
        return null;
    }

    // Only the top three spelling suggestions are considered for autocorrection.
    var scored = new List<(double Ratio, int LexicalRank, int ApostrophePenalty, int LengthDelta, int Distance, string Word)>();
    foreach (var candidate in GetSpellingSuggestions(normalized, 3))
    {
        var ratio = CalculateSimilarityRatio(normalized, candidate);
        var distance = DamerauLevenshteinDistance(normalized, candidate);
        if (ratio < minRatio || distance > maxEditDistance)
        {
            continue;
        }

        scored.Add((
            ratio,
            _wordNet.ContainsWord(candidate) ? 1 : 0,
            candidate.Contains('\'') ? 1 : 0,
            Math.Abs(candidate.Length - normalized.Length),
            distance,
            candidate
        ));
    }

    if (scored.Count == 0)
    {
        return null;
    }

    // Ranking: WordNet words first, then words without an apostrophe, then the smallest
    // length difference, then the smallest edit distance, then the highest ratio.
    var ranked = scored
        .OrderByDescending(item => item.LexicalRank)
        .ThenBy(item => item.ApostrophePenalty)
        .ThenBy(item => item.LengthDelta)
        .ThenBy(item => item.Distance)
        .ThenByDescending(item => item.Ratio)
        .ToList();

    // `ranked` is already ordered by the keys used for the picks below and LINQ ordering is
    // stable, so the first element that passes a filter is the best one for that filter.
    // FirstOrDefault yields a default tuple (Word == null) when nothing passes.
    if (normalized.EndsWith("ign", StringComparison.Ordinal))
    {
        var ingCandidate = ranked.FirstOrDefault(item => item.Word.EndsWith("ing", StringComparison.Ordinal));
        if (!string.IsNullOrWhiteSpace(ingCandidate.Word))
        {
            return ingCandidate.Word;
        }
    }

    var exactLength = ranked.FirstOrDefault(item => item.LengthDelta == 0);
    if (!string.IsNullOrWhiteSpace(exactLength.Word))
    {
        return exactLength.Word;
    }

    return ranked[0].Word;
}
|
|
|
|
#endregion
|
|
|
|
// MARK: - Text Analysis
|
|
#region Text Analysis
|
|
|
|
/// <summary>
/// Scans a block of text and reports every word that is not recognized.
/// </summary>
/// <param name="text">The text to analyze; it is split into lines on <c>'\n'</c>.</param>
/// <param name="suggestionLimit">Maximum number of suggestions attached to each issue.</param>
/// <returns>
/// One <see cref="SpellingIssueDto"/> per unrecognized word occurrence, constructed from the
/// raw word, its normalized form, the zero-based line index, and its spelling suggestions.
/// </returns>
public List<SpellingIssueDto> GetTextSpellingIssues(string text, int suggestionLimit = 6)
{
    var issues = new List<SpellingIssueDto>();
    var lines = text.Split('\n');

    for (int i = 0; i < lines.Length; i++)
    {
        var line = lines[i].Trim();

        // Lines beginning with '#', '@' or '>' are not spellchecked. The comparison is
        // ordinal, matching the explicit StringComparison.Ordinal used elsewhere in this
        // class, so the result does not depend on the current culture.
        if (line.StartsWith("#", StringComparison.Ordinal)
            || line.StartsWith("@", StringComparison.Ordinal)
            || line.StartsWith(">", StringComparison.Ordinal))
        {
            continue;
        }

        // Remove tags [tag]
        var analysisText = Regex.Replace(line, @"\[.*?\]", "");

        // NOTE(review): \w does not match an apostrophe, so a contraction such as "don't"
        // is tokenized as two words ("don" and "t"). Confirm this is the intended behavior.
        var words = Regex.Matches(analysisText, @"\b\w+\b");

        foreach (Match match in words)
        {
            var rawWord = match.Value;
            var normalized = _processor.NormalizeWord(rawWord);
            if (string.IsNullOrEmpty(normalized) || IsKnownWord(normalized))
            {
                continue;
            }

            issues.Add(new SpellingIssueDto(
                rawWord,
                normalized,
                i,
                GetSpellingSuggestions(normalized, suggestionLimit)
            ));
        }
    }

    return issues;
}
|
|
|
|
#endregion
|
|
}
|