stan44 e0f298ba36 Add LyricFlow .NET backend API and Python bridge integration
- introduce `LyricFlow.Core.Backend` with shared DTOs, rhyme/spellcheck engines, and REST endpoints
- wire Python GUI/core to run and call the backend via new bridge/client modules
- add backend parity/integration tests and update packaging/ignore settings
2026-03-15 01:44:56 -05:00

299 lines
8.2 KiB
C#

using System.Globalization;
using System.IO.Compression;
using LyricFlow.Core.Dtos;
namespace LyricFlow.Core.Services;
/// <summary>
/// Lazily loaded lookup over a WordNet database in the standard <c>index.*</c> / <c>data.*</c>
/// text format. The configured path may be a directory containing those files, or a
/// <c>.zip</c> archive that stores them under a <c>wordnet/</c> folder.
/// </summary>
/// <remarks>
/// Loading happens on first use and missing files are skipped silently, so a lexicon with no
/// usable corpus simply reports <see cref="IsAvailable"/> as <c>false</c>.
/// NOTE(review): lazy loading is not synchronized. If a single instance is shared between
/// concurrent callers, a second caller can observe a partially filled index — confirm how the
/// instance is registered before relying on it from multiple threads.
/// </remarks>
public class WordNetLexicon
{
    // Syntactic markers that data.adj appends to a word without any separating space.
    private static readonly string[] AdjectiveMarkers = ["(p)", "(a)", "(ip)"];

    private readonly string? _wordNetPath;

    // Normalized lemma (lower-case, underscores for spaces) -> synsets the lemma belongs to.
    private readonly Dictionary<string, List<SynsetRef>> _index = new(StringComparer.OrdinalIgnoreCase);

    // Offsets are only unique within one data file, so the part of speech is part of the key.
    private readonly Dictionary<(char Pos, long Offset), Synset> _synsets = new();

    private bool _loaded;

    // MARK: - Lifecycle
    #region Lifecycle

    /// <summary>Creates a lexicon over the given corpus location. Nothing is read until first use.</summary>
    /// <param name="wordNetPath">
    /// Directory holding the WordNet files, or a <c>.zip</c> archive with a <c>wordnet/</c> folder.
    /// <c>null</c> or blank yields an empty (unavailable) lexicon.
    /// </param>
    public WordNetLexicon(string? wordNetPath)
    {
        _wordNetPath = wordNetPath;
    }

    /// <summary>
    /// <c>true</c> when at least one lemma was loaded from the corpus. Triggers loading on first access.
    /// </summary>
    public bool IsAvailable
    {
        get
        {
            EnsureLoaded();
            return _index.Count > 0;
        }
    }

    #endregion

    // MARK: - Public Queries
    #region Public Queries

    /// <summary>Returns whether <paramref name="word"/> is a known lemma (case-insensitive).</summary>
    /// <param name="word">Word or phrase; surrounding whitespace is ignored and inner spaces match underscores.</param>
    /// <returns><c>false</c> for null/blank input or when the lemma is not indexed.</returns>
    public bool ContainsWord(string word)
    {
        if (string.IsNullOrWhiteSpace(word))
        {
            return false;
        }

        EnsureLoaded();
        return _index.ContainsKey(NormalizeLemma(word));
    }

    /// <summary>
    /// Collects synonyms (other words in the same synsets) and "vibe" words (words of synsets
    /// reached through pointers whose symbol starts with <c>@</c>) for <paramref name="word"/>.
    /// </summary>
    /// <param name="word">Word or phrase to look up.</param>
    /// <param name="limit">Maximum number of entries returned in each list.</param>
    /// <returns>
    /// Both lists sorted case-insensitively and truncated to <paramref name="limit"/>; two empty
    /// lists when the input is null/blank or the word is unknown.
    /// </returns>
    public SynonymResponseDto FindSynonyms(string word, int limit = 15)
    {
        if (string.IsNullOrWhiteSpace(word))
        {
            return new SynonymResponseDto([], []);
        }

        EnsureLoaded();
        var normalized = NormalizeLemma(word);
        if (!_index.TryGetValue(normalized, out var refs))
        {
            return new SynonymResponseDto([], []);
        }

        // Compare against the normalized query in display form. Comparing against the raw
        // argument let padded ("  happy ") or underscored ("ice_cream") input slip the query
        // word into its own synonym list.
        var queryDisplay = ToDisplayForm(normalized);

        var synonyms = new SortedSet<string>(StringComparer.OrdinalIgnoreCase);
        var vibe = new SortedSet<string>(StringComparer.OrdinalIgnoreCase);
        foreach (var synsetRef in refs)
        {
            if (!TryGetSynset(synsetRef.Pos, synsetRef.Offset, out var synset))
            {
                continue;
            }

            foreach (var lemma in synset.Words)
            {
                var name = ToDisplayForm(lemma);
                if (!name.Equals(queryDisplay, StringComparison.OrdinalIgnoreCase))
                {
                    synonyms.Add(name);
                }
            }

            // "@" and "@i" are the WordNet hypernym / instance-hypernym pointer symbols.
            foreach (var pointer in synset.Pointers.Where(pointer => pointer.Symbol.StartsWith("@", StringComparison.Ordinal)))
            {
                if (!TryGetSynset(pointer.TargetPos, pointer.TargetOffset, out var target))
                {
                    continue;
                }

                foreach (var lemma in target.Words)
                {
                    vibe.Add(ToDisplayForm(lemma));
                }
            }
        }

        return new SynonymResponseDto(synonyms.Take(limit).ToList(), vibe.Take(limit).ToList());
    }

    #endregion

    // MARK: - Corpus Loading
    #region Corpus Loading

    /// <summary>Loads all index and data files once; later calls are no-ops.</summary>
    private void EnsureLoaded()
    {
        if (_loaded)
        {
            return;
        }

        // Set before loading so a failed or empty load is not retried on every query.
        _loaded = true;
        if (string.IsNullOrWhiteSpace(_wordNetPath))
        {
            return;
        }

        LoadIndexFile('n', "index.noun");
        LoadIndexFile('v', "index.verb");
        LoadIndexFile('a', "index.adj");
        LoadIndexFile('r', "index.adv");
        LoadDataFile("data.noun");
        LoadDataFile("data.verb");
        LoadDataFile("data.adj");
        LoadDataFile("data.adv");
    }

    /// <summary>
    /// Parses one <c>index.*</c> file and records, per lemma, the synset offsets it belongs to.
    /// Malformed lines are skipped.
    /// </summary>
    /// <param name="pos">Part-of-speech tag stored with every reference read from this file.</param>
    /// <param name="relativePath">File name relative to the corpus root.</param>
    private void LoadIndexFile(char pos, string relativePath)
    {
        foreach (var line in ReadCorpusLines(relativePath))
        {
            // Lines that are blank or start with whitespace (the file header) carry no entry.
            if (string.IsNullOrWhiteSpace(line) || char.IsWhiteSpace(line[0]))
            {
                continue;
            }

            var tokens = line.Split(' ', StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length < 6
                || !TryParseCount(tokens[2], NumberStyles.Integer, out var synsetCount)
                || !TryParseCount(tokens[3], NumberStyles.Integer, out var pointerCount))
            {
                continue;
            }

            // Layout: lemma pos synset_cnt p_cnt [p_cnt symbols] sense_cnt tagsense_cnt offsets...
            // Computed as long so an absurd count cannot overflow past the bounds check.
            var offsetStart = 6L + pointerCount;
            if (tokens.Length < offsetStart + synsetCount)
            {
                continue;
            }

            var lemma = NormalizeLemma(tokens[0]);
            if (!_index.TryGetValue(lemma, out var refs))
            {
                refs = [];
                _index[lemma] = refs;
            }

            var firstOffset = (int)offsetStart;
            for (var i = 0; i < synsetCount; i++)
            {
                if (long.TryParse(tokens[firstOffset + i], NumberStyles.Integer, CultureInfo.InvariantCulture, out var offset))
                {
                    refs.Add(new SynsetRef(pos, offset));
                }
            }
        }
    }

    /// <summary>
    /// Parses one <c>data.*</c> file into synsets (words plus outgoing pointers), keyed by the
    /// synset type found on each line and its offset. Malformed lines are skipped.
    /// </summary>
    /// <param name="relativePath">File name relative to the corpus root.</param>
    private void LoadDataFile(string relativePath)
    {
        foreach (var line in ReadCorpusLines(relativePath))
        {
            if (string.IsNullOrWhiteSpace(line) || char.IsWhiteSpace(line[0]))
            {
                continue;
            }

            // Everything after the first '|' is the gloss, which is not needed here.
            var data = line.Split('|', 2)[0].Trim();
            var tokens = data.Split(' ', StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length < 5 || !long.TryParse(tokens[0], NumberStyles.Integer, CultureInfo.InvariantCulture, out var offset))
            {
                continue;
            }

            var synsetType = tokens[2][0];

            // The word count is hexadecimal; the pointer count further down is decimal.
            if (!TryParseCount(tokens[3], NumberStyles.HexNumber, out var wordCount))
            {
                continue;
            }

            var index = 4;

            // Capacity is capped by the tokens actually present so a corrupt count cannot
            // trigger a huge allocation.
            var words = new List<string>(Math.Min(wordCount, tokens.Length));
            for (var i = 0; i < wordCount && index + 1 < tokens.Length; i++)
            {
                // Each word is followed by a lex_id token, hence the step of two.
                words.Add(StripAdjectiveMarker(tokens[index].ToLowerInvariant()));
                index += 2;
            }

            if (index >= tokens.Length || !TryParseCount(tokens[index], NumberStyles.Integer, out var pointerCount))
            {
                continue;
            }

            index++;
            var pointers = new List<SynsetPointer>(Math.Min(pointerCount, tokens.Length));
            for (var i = 0; i < pointerCount && index + 3 < tokens.Length; i++)
            {
                // Pointer group: symbol, target offset, target pos, source/target field (unused).
                if (long.TryParse(tokens[index + 1], NumberStyles.Integer, CultureInfo.InvariantCulture, out var targetOffset))
                {
                    pointers.Add(new SynsetPointer(tokens[index], targetOffset, tokens[index + 2][0]));
                }

                index += 4;
            }

            var synset = new Synset(words, pointers);
            _synsets[(synsetType, offset)] = synset;

            // Type 's' synsets are also registered under 'a', the tag used for index.adj lookups.
            if (synsetType == 's')
            {
                _synsets[('a', offset)] = synset;
            }
        }
    }

    /// <summary>
    /// Parses a non-negative count using the invariant culture, matching how offsets are parsed.
    /// </summary>
    /// <returns><c>false</c> when the token is not a number in the given style or is negative.</returns>
    private static bool TryParseCount(string token, NumberStyles style, out int count)
    {
        return int.TryParse(token, style, CultureInfo.InvariantCulture, out count) && count >= 0;
    }

    /// <summary>
    /// Removes a trailing adjective syntactic marker — <c>(p)</c>, <c>(a)</c> or <c>(ip)</c> —
    /// so the stored word matches the plain lemma used by the index files.
    /// </summary>
    private static string StripAdjectiveMarker(string word)
    {
        foreach (var marker in AdjectiveMarkers)
        {
            if (word.Length > marker.Length && word.EndsWith(marker, StringComparison.Ordinal))
            {
                return word[..^marker.Length];
            }
        }

        return word;
    }

    #endregion

    // MARK: - Synset Resolution
    #region Synset Resolution

    /// <summary>
    /// Looks up a synset by part of speech and offset, retrying an <c>'a'</c> request under
    /// <c>'s'</c> when the direct lookup misses.
    /// </summary>
    private bool TryGetSynset(char pos, long offset, out Synset synset)
    {
        if (_synsets.TryGetValue((pos, offset), out synset!))
        {
            return true;
        }

        if (pos == 'a' && _synsets.TryGetValue(('s', offset), out synset!))
        {
            return true;
        }

        synset = null!;
        return false;
    }

    #endregion

    // MARK: - Corpus Readers
    #region Corpus Readers

    /// <summary>
    /// Streams the lines of one corpus file from the configured directory or zip archive.
    /// Yields nothing when the path is blank, the file/entry is missing, or the path is neither
    /// a directory nor an existing <c>.zip</c> file.
    /// </summary>
    /// <param name="relativePath">File name relative to the corpus root.</param>
    private IEnumerable<string> ReadCorpusLines(string relativePath)
    {
        if (string.IsNullOrWhiteSpace(_wordNetPath))
        {
            yield break;
        }

        if (Directory.Exists(_wordNetPath))
        {
            var path = Path.Combine(_wordNetPath, relativePath);
            if (!File.Exists(path))
            {
                yield break;
            }

            foreach (var line in File.ReadLines(path))
            {
                yield return line;
            }

            yield break;
        }

        if (!File.Exists(_wordNetPath) || !string.Equals(Path.GetExtension(_wordNetPath), ".zip", StringComparison.OrdinalIgnoreCase))
        {
            yield break;
        }

        using var archive = ZipFile.OpenRead(_wordNetPath);
        var entry = archive.GetEntry($"wordnet/{relativePath}");
        if (entry is null)
        {
            yield break;
        }

        using var stream = entry.Open();
        using var reader = new StreamReader(stream);
        while (reader.ReadLine() is { } line)
        {
            yield return line;
        }
    }

    #endregion

    // MARK: - Normalization And Records
    #region Normalization And Records

    /// <summary>Converts user input to index-key form: trimmed, lower-case, spaces as underscores.</summary>
    private static string NormalizeLemma(string word)
    {
        return word.Trim().ToLowerInvariant().Replace(' ', '_');
    }

    /// <summary>Converts a stored lemma to the form shown to callers (underscores as spaces).</summary>
    private static string ToDisplayForm(string lemma)
    {
        return lemma.Replace('_', ' ');
    }

    private sealed record SynsetRef(char Pos, long Offset);

    private sealed record Synset(List<string> Words, List<SynsetPointer> Pointers);

    private sealed record SynsetPointer(string Symbol, long TargetOffset, char TargetPos);

    #endregion
}