using System.Globalization;
using System.IO.Compression;
using LyricFlow.Core.Dtos;

namespace LyricFlow.Core.Services;

/// <summary>
/// Lazily loaded, in-memory view of a WordNet database used to look up synonyms
/// and related ("vibe") words.
/// </summary>
/// <remarks>
/// The corpus is read on first use from either a directory containing the
/// <c>index.*</c> / <c>data.*</c> files or a <c>.zip</c> archive that stores them
/// under a <c>wordnet/</c> folder. Missing files are skipped silently, in which
/// case <see cref="IsAvailable"/> reports <c>false</c>.
/// No synchronization is performed around the lazy load.
/// </remarks>
public class WordNetLexicon
{
    /// <summary>
    /// Syntactic markers that the WordNet database format appends (without a
    /// space) to some adjective words in <c>data.adj</c>. They are not part of
    /// the lemma and must be removed before the word is shown or compared.
    /// </summary>
    private static readonly string[] SyntacticMarkers = ["(p)", "(a)", "(ip)"];

    private readonly string? _wordNetPath;

    /// <summary>Normalized lemma → every synset (across all parts of speech) that contains it.</summary>
    private readonly Dictionary<string, List<SynsetRef>> _index = new(StringComparer.OrdinalIgnoreCase);

    /// <summary>(synset type, byte offset) → parsed synset.</summary>
    private readonly Dictionary<(char Pos, long Offset), Synset> _synsets = new();

    private bool _loaded;

    // MARK: - Lifecycle
    #region Lifecycle

    /// <summary>Creates a lexicon backed by the given corpus location.</summary>
    /// <param name="wordNetPath">
    /// Directory or <c>.zip</c> file holding the WordNet database. When null or
    /// blank the lexicon stays empty.
    /// </param>
    public WordNetLexicon(string? wordNetPath)
    {
        _wordNetPath = wordNetPath;
    }

    /// <summary>
    /// <c>true</c> when at least one lemma was loaded from the corpus.
    /// Reading this property triggers the lazy load.
    /// </summary>
    public bool IsAvailable
    {
        get
        {
            EnsureLoaded();
            return _index.Count > 0;
        }
    }

    #endregion

    // MARK: - Public Queries
    #region Public Queries

    /// <summary>Reports whether the lexicon has an index entry for <paramref name="word"/>.</summary>
    /// <param name="word">Word or phrase; surrounding whitespace and letter case are ignored.</param>
    /// <returns><c>true</c> when the normalized lemma is present in the index.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
    public bool ContainsWord(string word)
    {
        ArgumentNullException.ThrowIfNull(word);

        EnsureLoaded();
        return _index.ContainsKey(NormalizeLemma(word));
    }

    /// <summary>
    /// Collects synonyms (other words in the same synsets) and "vibe" words
    /// (words of synsets reached through pointers whose symbol starts with
    /// <c>@</c>) for <paramref name="word"/>.
    /// </summary>
    /// <param name="word">Word or phrase; surrounding whitespace and letter case are ignored.</param>
    /// <param name="limit">Maximum number of entries returned in each list.</param>
    /// <returns>
    /// Two alphabetically ordered, case-insensitively de-duplicated lists. Both
    /// are empty when the word is unknown.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
    public SynonymResponseDto FindSynonyms(string word, int limit = 15)
    {
        ArgumentNullException.ThrowIfNull(word);

        EnsureLoaded();

        var normalized = NormalizeLemma(word);
        if (!_index.TryGetValue(normalized, out var refs))
        {
            return new SynonymResponseDto([], []);
        }

        var synonyms = new SortedSet<string>(StringComparer.OrdinalIgnoreCase);
        var vibe = new SortedSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (var synsetRef in refs)
        {
            if (!TryGetSynset(synsetRef.Pos, synsetRef.Offset, out var synset))
            {
                continue;
            }

            foreach (var lemma in synset.Words)
            {
                // Compare in normalized (underscore) form so that input such as
                // " happy " or "ice_cream" never comes back as its own synonym.
                if (!lemma.Equals(normalized, StringComparison.OrdinalIgnoreCase))
                {
                    synonyms.Add(lemma.Replace('_', ' '));
                }
            }

            var upwardPointers = synset.Pointers
                .Where(candidate => candidate.Symbol.StartsWith("@", StringComparison.Ordinal));

            foreach (var pointer in upwardPointers)
            {
                if (!TryGetSynset(pointer.TargetPos, pointer.TargetOffset, out var target))
                {
                    continue;
                }

                foreach (var lemma in target.Words)
                {
                    vibe.Add(lemma.Replace('_', ' '));
                }
            }
        }

        return new SynonymResponseDto(synonyms.Take(limit).ToList(), vibe.Take(limit).ToList());
    }

    #endregion

    // MARK: - Corpus Loading
    #region Corpus Loading

    /// <summary>
    /// Loads the index and data files once. The loaded flag is set before any
    /// file is read, so a load that throws part-way is not retried.
    /// </summary>
    private void EnsureLoaded()
    {
        if (_loaded)
        {
            return;
        }

        _loaded = true;

        if (string.IsNullOrWhiteSpace(_wordNetPath))
        {
            return;
        }

        LoadIndexFile('n', "index.noun");
        LoadIndexFile('v', "index.verb");
        LoadIndexFile('a', "index.adj");
        LoadIndexFile('r', "index.adv");

        LoadDataFile("data.noun");
        LoadDataFile("data.verb");
        LoadDataFile("data.adj");
        LoadDataFile("data.adv");
    }

    /// <summary>
    /// Parses one <c>index.*</c> file and records, per lemma, the synset offsets
    /// listed at the end of each line.
    /// </summary>
    /// <param name="pos">Part-of-speech tag stored with every reference from this file.</param>
    /// <param name="relativePath">File name relative to the corpus root.</param>
    private void LoadIndexFile(char pos, string relativePath)
    {
        foreach (var line in ReadCorpusLines(relativePath))
        {
            // Lines that are blank or start with whitespace are not entries.
            if (string.IsNullOrWhiteSpace(line) || char.IsWhiteSpace(line[0]))
            {
                continue;
            }

            var tokens = line.Split(' ', StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length < 6
                || !int.TryParse(tokens[2], NumberStyles.Integer, CultureInfo.InvariantCulture, out var synsetCount)
                || !int.TryParse(tokens[3], NumberStyles.Integer, CultureInfo.InvariantCulture, out var pointerCount)
                || synsetCount < 0
                || pointerCount < 0)
            {
                // A negative count would otherwise produce a negative token index below.
                continue;
            }

            // Four leading fields, then the pointer symbols, then two more count
            // fields precede the synset offsets.
            var offsetStart = 6 + pointerCount;
            if (tokens.Length < offsetStart + synsetCount)
            {
                continue;
            }

            var lemma = NormalizeLemma(tokens[0]);
            if (!_index.TryGetValue(lemma, out var refs))
            {
                refs = [];
                _index[lemma] = refs;
            }

            for (var i = 0; i < synsetCount; i++)
            {
                if (long.TryParse(tokens[offsetStart + i], NumberStyles.Integer, CultureInfo.InvariantCulture, out var offset))
                {
                    refs.Add(new SynsetRef(pos, offset));
                }
            }
        }
    }

    /// <summary>
    /// Parses one <c>data.*</c> file into synsets keyed by (synset type, offset).
    /// </summary>
    /// <param name="relativePath">File name relative to the corpus root.</param>
    private void LoadDataFile(string relativePath)
    {
        foreach (var line in ReadCorpusLines(relativePath))
        {
            if (string.IsNullOrWhiteSpace(line) || char.IsWhiteSpace(line[0]))
            {
                continue;
            }

            // Everything after the first '|' is the gloss and is not needed here.
            var data = line.Split('|', 2)[0].Trim();
            var tokens = data.Split(' ', StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length < 5
                || !long.TryParse(tokens[0], NumberStyles.Integer, CultureInfo.InvariantCulture, out var offset))
            {
                continue;
            }

            var synsetType = tokens[2][0];

            // The word count is hexadecimal; an 8-digit value such as "FFFFFFFF"
            // parses to a negative int, so reject that explicitly.
            if (!int.TryParse(tokens[3], NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var wordCount)
                || wordCount < 0)
            {
                continue;
            }

            var index = 4;

            // Cap the pre-sized capacity by what the line can actually hold so a
            // corrupt count cannot trigger a huge allocation.
            var words = new List<string>(Math.Min(wordCount, tokens.Length));
            for (var i = 0; i < wordCount && index + 1 < tokens.Length; i++)
            {
                words.Add(StripSyntacticMarker(tokens[index].ToLowerInvariant()));
                index += 2; // each word is followed by one extra field that is skipped
            }

            if (index >= tokens.Length
                || !int.TryParse(tokens[index], NumberStyles.Integer, CultureInfo.InvariantCulture, out var pointerCount)
                || pointerCount < 0)
            {
                continue;
            }

            index++;

            var pointers = new List<SynsetPointer>(Math.Min(pointerCount, tokens.Length));
            for (var i = 0; i < pointerCount && index + 3 < tokens.Length; i++)
            {
                // Pointer layout: symbol, target offset, target pos, one further field (ignored).
                if (long.TryParse(tokens[index + 1], NumberStyles.Integer, CultureInfo.InvariantCulture, out var targetOffset))
                {
                    pointers.Add(new SynsetPointer(tokens[index], targetOffset, tokens[index + 2][0]));
                }

                index += 4;
            }

            var synset = new Synset(words, pointers);
            _synsets[(synsetType, offset)] = synset;

            // Type 's' synsets are also registered under 'a' so that references
            // loaded from index.adj (tagged 'a') resolve directly.
            if (synsetType == 's')
            {
                _synsets[('a', offset)] = synset;
            }
        }
    }

    #endregion

    // MARK: - Synset Resolution
    #region Synset Resolution

    /// <summary>
    /// Looks up a synset by part of speech and offset, falling back from
    /// <c>'a'</c> to <c>'s'</c> when the direct key is absent.
    /// </summary>
    /// <returns><c>true</c> and a non-null <paramref name="synset"/> when found.</returns>
    private bool TryGetSynset(char pos, long offset, out Synset synset)
    {
        if (_synsets.TryGetValue((pos, offset), out synset!))
        {
            return true;
        }

        if (pos == 'a' && _synsets.TryGetValue(('s', offset), out synset!))
        {
            return true;
        }

        synset = null!;
        return false;
    }

    #endregion

    // MARK: - Corpus Readers
    #region Corpus Readers

    /// <summary>
    /// Streams the lines of a corpus file from the configured directory or from
    /// the <c>wordnet/</c> folder of the configured <c>.zip</c> archive.
    /// Yields nothing when the path is blank or the file/entry does not exist.
    /// </summary>
    /// <param name="relativePath">File name relative to the corpus root.</param>
    private IEnumerable<string> ReadCorpusLines(string relativePath)
    {
        if (string.IsNullOrWhiteSpace(_wordNetPath))
        {
            yield break;
        }

        if (Directory.Exists(_wordNetPath))
        {
            var path = Path.Combine(_wordNetPath, relativePath);
            if (!File.Exists(path))
            {
                yield break;
            }

            foreach (var line in File.ReadLines(path))
            {
                yield return line;
            }

            yield break;
        }

        if (!File.Exists(_wordNetPath)
            || !string.Equals(Path.GetExtension(_wordNetPath), ".zip", StringComparison.OrdinalIgnoreCase))
        {
            yield break;
        }

        using var archive = ZipFile.OpenRead(_wordNetPath);
        var entry = archive.GetEntry($"wordnet/{relativePath}");
        if (entry is null)
        {
            yield break;
        }

        using var stream = entry.Open();
        using var reader = new StreamReader(stream);
        while (reader.ReadLine() is { } line)
        {
            yield return line;
        }
    }

    #endregion

    // MARK: - Normalization And Records
    #region Normalization And Records

    /// <summary>
    /// Converts user input to the index key form: trimmed, lower-cased, with
    /// spaces replaced by underscores.
    /// </summary>
    private static string NormalizeLemma(string word)
    {
        return word.Trim().ToLowerInvariant().Replace(' ', '_');
    }

    /// <summary>
    /// Removes a trailing adjective syntactic marker (<c>(p)</c>, <c>(a)</c> or
    /// <c>(ip)</c>) from a lower-cased data-file word, if one is present.
    /// </summary>
    private static string StripSyntacticMarker(string word)
    {
        foreach (var marker in SyntacticMarkers)
        {
            if (word.Length > marker.Length && word.EndsWith(marker, StringComparison.Ordinal))
            {
                return word[..^marker.Length];
            }
        }

        return word;
    }

    /// <summary>Reference from an index entry to a synset.</summary>
    private sealed record SynsetRef(char Pos, long Offset);

    /// <summary>Words of a synset and its outgoing pointers.</summary>
    private sealed record Synset(List<string> Words, List<SynsetPointer> Pointers);

    /// <summary>Outgoing relation from one synset to another.</summary>
    private sealed record SynsetPointer(string Symbol, long TargetOffset, char TargetPos);

    #endregion
}