- introduce `LyricFlow.Core.Backend` with shared DTOs, rhyme/spellcheck engines, and REST endpoints - wire Python GUI/core to run and call the backend via new bridge/client modules - add backend parity/integration tests and update packaging/ignore settings
299 lines
8.2 KiB
C#
299 lines
8.2 KiB
C#
using System.Globalization;
|
|
using System.IO.Compression;
|
|
using LyricFlow.Core.Dtos;
|
|
|
|
namespace LyricFlow.Core.Services;
|
|
|
|
/// <summary>
/// Lazily loaded lookup over WordNet database files (<c>index.noun</c>, <c>data.noun</c>, ...).
/// The files are read either from a directory or from a <c>.zip</c> archive whose entries
/// live under a <c>wordnet/</c> folder.
/// </summary>
/// <remarks>
/// Nothing is read from disk until the first query. Loading is attempted exactly once:
/// if it throws, the exception propagates to that first caller and later queries run
/// against whatever was loaded before the failure. Loading is serialized so concurrent
/// first queries never observe half-built dictionaries; after loading the dictionaries
/// are only read.
/// </remarks>
public class WordNetLexicon
{
    // An index line starts with: lemma, pos, synset_cnt, p_cnt, then p_cnt pointer symbols,
    // then sense_cnt and tagsense_cnt, and only then the synset offsets. Six fixed tokens
    // plus the pointer symbols therefore precede the first offset.
    private const int IndexFixedTokenCount = 6;

    private readonly string? _wordNetPath;
    private readonly Dictionary<string, List<SynsetRef>> _index = new(StringComparer.OrdinalIgnoreCase);
    private readonly Dictionary<(char Pos, long Offset), Synset> _synsets = new();
    private readonly object _loadGate = new();

    // Written only under _loadGate, after the dictionaries have been populated.
    private volatile bool _loaded;

    // MARK: - Lifecycle
    #region Lifecycle

    /// <summary>Creates a lexicon backed by the given WordNet location.</summary>
    /// <param name="wordNetPath">
    /// Directory containing the WordNet files, or path to a <c>.zip</c> archive with a
    /// <c>wordnet/</c> folder. <c>null</c> or blank yields an empty lexicon.
    /// </param>
    public WordNetLexicon(string? wordNetPath)
    {
        _wordNetPath = wordNetPath;
    }

    /// <summary>
    /// <c>true</c> when at least one lemma was loaded from the index files.
    /// Triggers the one-time load on first access.
    /// </summary>
    public bool IsAvailable
    {
        get
        {
            EnsureLoaded();
            return _index.Count > 0;
        }
    }

    #endregion

    // MARK: - Public Queries
    #region Public Queries

    /// <summary>Checks whether <paramref name="word"/> is a known lemma.</summary>
    /// <param name="word">Word or phrase; case, surrounding whitespace and space/underscore differences are ignored.</param>
    /// <returns><c>true</c> if the normalized lemma is present in any index file; <c>false</c> for null or blank input.</returns>
    public bool ContainsWord(string word)
    {
        if (string.IsNullOrWhiteSpace(word))
        {
            return false;
        }

        EnsureLoaded();
        return _index.ContainsKey(NormalizeLemma(word));
    }

    /// <summary>
    /// Collects synonyms (other lemmas of the word's synsets) and "vibe" words
    /// (lemmas of synsets reached through pointers whose symbol starts with <c>@</c>).
    /// </summary>
    /// <param name="word">Word or phrase to look up.</param>
    /// <param name="limit">Maximum number of entries returned in each list.</param>
    /// <returns>
    /// Two lists, each sorted case-insensitively and truncated to <paramref name="limit"/>.
    /// Both are empty when the word is null, blank or unknown.
    /// </returns>
    public SynonymResponseDto FindSynonyms(string word, int limit = 15)
    {
        if (string.IsNullOrWhiteSpace(word))
        {
            return new SynonymResponseDto([], []);
        }

        EnsureLoaded();
        var normalized = NormalizeLemma(word);
        if (!_index.TryGetValue(normalized, out var refs))
        {
            return new SynonymResponseDto([], []);
        }

        // Compare against the normalized query, not the raw argument: otherwise inputs such as
        // " dog " or "hot_dog" would never match their own lemma and be reported as a synonym.
        var queryDisplay = ToDisplayForm(normalized);

        var synonyms = new SortedSet<string>(StringComparer.OrdinalIgnoreCase);
        var vibe = new SortedSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (var synsetRef in refs)
        {
            if (!TryGetSynset(synsetRef.Pos, synsetRef.Offset, out var synset))
            {
                continue;
            }

            foreach (var lemma in synset.Words)
            {
                var name = ToDisplayForm(lemma);
                if (!name.Equals(queryDisplay, StringComparison.OrdinalIgnoreCase))
                {
                    synonyms.Add(name);
                }
            }

            foreach (var pointer in synset.Pointers)
            {
                if (!pointer.Symbol.StartsWith("@", StringComparison.Ordinal))
                {
                    continue;
                }

                if (!TryGetSynset(pointer.TargetPos, pointer.TargetOffset, out var target))
                {
                    continue;
                }

                foreach (var lemma in target.Words)
                {
                    vibe.Add(ToDisplayForm(lemma));
                }
            }
        }

        return new SynonymResponseDto(synonyms.Take(limit).ToList(), vibe.Take(limit).ToList());
    }

    #endregion

    // MARK: - Corpus Loading
    #region Corpus Loading

    /// <summary>
    /// Loads all index and data files on first use. Only one thread performs the load;
    /// others wait on the gate and then see the fully populated dictionaries.
    /// </summary>
    private void EnsureLoaded()
    {
        if (_loaded)
        {
            return;
        }

        lock (_loadGate)
        {
            if (_loaded)
            {
                return;
            }

            try
            {
                if (string.IsNullOrWhiteSpace(_wordNetPath))
                {
                    return;
                }

                LoadIndexFile('n', "index.noun");
                LoadIndexFile('v', "index.verb");
                LoadIndexFile('a', "index.adj");
                LoadIndexFile('r', "index.adv");

                LoadDataFile("data.noun");
                LoadDataFile("data.verb");
                LoadDataFile("data.adj");
                LoadDataFile("data.adv");
            }
            finally
            {
                // Mark as loaded even on failure so a broken corpus is not re-read on every query.
                _loaded = true;
            }
        }
    }

    /// <summary>Reads one index file and records, per lemma, the synset offsets it belongs to.</summary>
    /// <param name="pos">Part-of-speech tag stored with every offset from this file.</param>
    /// <param name="relativePath">File name relative to the WordNet root.</param>
    private void LoadIndexFile(char pos, string relativePath)
    {
        foreach (var line in ReadCorpusLines(relativePath))
        {
            // Lines that are blank or start with whitespace carry no entry (file header).
            if (string.IsNullOrWhiteSpace(line) || char.IsWhiteSpace(line[0]))
            {
                continue;
            }

            var tokens = line.Split(' ', StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length < IndexFixedTokenCount
                || !TryParseCount(tokens[2], NumberStyles.Integer, out var synsetCount)
                || !TryParseCount(tokens[3], NumberStyles.Integer, out var pointerCount))
            {
                continue;
            }

            // Use long arithmetic so absurd counts in a corrupt file cannot overflow into
            // a small or negative value and slip past the bounds check.
            var requiredTokens = (long)IndexFixedTokenCount + pointerCount + synsetCount;
            if (tokens.Length < requiredTokens)
            {
                continue;
            }

            var offsetStart = IndexFixedTokenCount + pointerCount;
            var lemma = NormalizeLemma(tokens[0]);
            if (!_index.TryGetValue(lemma, out var refs))
            {
                refs = [];
                _index[lemma] = refs;
            }

            for (var i = 0; i < synsetCount; i++)
            {
                if (long.TryParse(tokens[offsetStart + i], NumberStyles.Integer, CultureInfo.InvariantCulture, out var offset))
                {
                    refs.Add(new SynsetRef(pos, offset));
                }
            }
        }
    }

    /// <summary>Reads one data file and stores each synset (its words and pointers) by type and offset.</summary>
    /// <param name="relativePath">File name relative to the WordNet root.</param>
    private void LoadDataFile(string relativePath)
    {
        foreach (var line in ReadCorpusLines(relativePath))
        {
            if (string.IsNullOrWhiteSpace(line) || char.IsWhiteSpace(line[0]))
            {
                continue;
            }

            // Everything after the first '|' is the gloss, which is not used here.
            var data = line.Split('|', 2)[0].Trim();
            var tokens = data.Split(' ', StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length < 5 || !long.TryParse(tokens[0], NumberStyles.Integer, CultureInfo.InvariantCulture, out var offset))
            {
                continue;
            }

            var synsetType = tokens[2][0];

            // The word count is stored in hexadecimal, unlike the pointer count below.
            if (!TryParseCount(tokens[3], NumberStyles.HexNumber, out var wordCount))
            {
                continue;
            }

            var index = 4;

            // Cap the pre-sized capacity by the number of tokens actually present so a
            // corrupt count cannot trigger a huge allocation.
            var words = new List<string>(Math.Min(wordCount, tokens.Length));
            for (var i = 0; i < wordCount && index + 1 < tokens.Length; i++)
            {
                words.Add(StripSyntacticMarker(tokens[index]).ToLowerInvariant());
                index += 2; // skip the lex_id that follows every word
            }

            if (index >= tokens.Length || !TryParseCount(tokens[index], NumberStyles.Integer, out var pointerCount))
            {
                continue;
            }

            index++;
            var pointers = new List<SynsetPointer>(Math.Min(pointerCount, tokens.Length));
            for (var i = 0; i < pointerCount && index + 3 < tokens.Length; i++)
            {
                // A pointer is four tokens: symbol, target offset, target pos, source/target.
                if (long.TryParse(tokens[index + 1], NumberStyles.Integer, CultureInfo.InvariantCulture, out var targetOffset))
                {
                    pointers.Add(new SynsetPointer(tokens[index], targetOffset, tokens[index + 2][0]));
                }

                index += 4;
            }

            var synset = new Synset(words, pointers);
            _synsets[(synsetType, offset)] = synset;
            if (synsetType == 's')
            {
                // Type 's' synsets live in the adjective file; index entries and pointers
                // refer to them with pos 'a', so register them under that key as well.
                _synsets[('a', offset)] = synset;
            }
        }
    }

    /// <summary>Parses a non-negative count using the invariant culture.</summary>
    private static bool TryParseCount(string token, NumberStyles style, out int count)
    {
        return int.TryParse(token, style, CultureInfo.InvariantCulture, out count) && count >= 0;
    }

    /// <summary>
    /// Removes a trailing parenthesized syntactic marker such as <c>(p)</c>, <c>(a)</c> or <c>(ip)</c>,
    /// which the WordNet data format appends directly to some adjective words.
    /// </summary>
    private static string StripSyntacticMarker(string word)
    {
        if (word.EndsWith(')'))
        {
            var open = word.LastIndexOf('(');
            if (open > 0)
            {
                return word[..open];
            }
        }

        return word;
    }

    #endregion

    // MARK: - Synset Resolution
    #region Synset Resolution

    /// <summary>Looks up a synset by part of speech and offset.</summary>
    /// <returns><c>true</c> and the synset when found; otherwise <c>false</c> and <c>null</c>.</returns>
    private bool TryGetSynset(
        char pos,
        long offset,
        [System.Diagnostics.CodeAnalysis.NotNullWhen(true)] out Synset? synset)
    {
        if (_synsets.TryGetValue((pos, offset), out synset))
        {
            return true;
        }

        // Adjective lookups may also be satisfied by a type 's' synset at the same offset.
        if (pos == 'a' && _synsets.TryGetValue(('s', offset), out synset))
        {
            return true;
        }

        synset = null;
        return false;
    }

    #endregion

    // MARK: - Corpus Readers
    #region Corpus Readers

    /// <summary>
    /// Streams the lines of one corpus file. Yields nothing when the path is blank, the file
    /// or archive entry does not exist, or the path is neither a directory nor a <c>.zip</c> file.
    /// </summary>
    /// <param name="relativePath">File name relative to the WordNet root.</param>
    private IEnumerable<string> ReadCorpusLines(string relativePath)
    {
        if (string.IsNullOrWhiteSpace(_wordNetPath))
        {
            yield break;
        }

        if (Directory.Exists(_wordNetPath))
        {
            var path = Path.Combine(_wordNetPath, relativePath);
            if (!File.Exists(path))
            {
                yield break;
            }

            foreach (var line in File.ReadLines(path))
            {
                yield return line;
            }

            yield break;
        }

        if (!File.Exists(_wordNetPath) || !string.Equals(Path.GetExtension(_wordNetPath), ".zip", StringComparison.OrdinalIgnoreCase))
        {
            yield break;
        }

        using var archive = ZipFile.OpenRead(_wordNetPath);
        var entry = archive.GetEntry($"wordnet/{relativePath}");
        if (entry is null)
        {
            yield break;
        }

        using var stream = entry.Open();
        using var reader = new StreamReader(stream);
        while (reader.ReadLine() is { } line)
        {
            yield return line;
        }
    }

    #endregion

    // MARK: - Normalization And Records
    #region Normalization And Records

    /// <summary>Converts user input to the lemma key form: trimmed, lower-case, spaces as underscores.</summary>
    private static string NormalizeLemma(string word)
    {
        return word.Trim().ToLowerInvariant().Replace(' ', '_');
    }

    /// <summary>Converts a stored lemma to its display form (underscores become spaces).</summary>
    private static string ToDisplayForm(string lemma)
    {
        return lemma.Replace('_', ' ');
    }

    // Small immutable key; a struct avoids one heap allocation per index entry.
    private readonly record struct SynsetRef(char Pos, long Offset);

    private sealed record Synset(List<string> Words, List<SynsetPointer> Pointers);

    private sealed record SynsetPointer(string Symbol, long TargetOffset, char TargetPos);

    #endregion
}
|