using System;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;

namespace Journal.Core.Services.Entries;

/// <summary>
/// Converts rich HTML (for example, content pasted from a WYSIWYG editor) into
/// readable plain text while leaving plain-text content untouched.
/// </summary>
public static class HtmlSanitizer
{
    /// <summary>Minimum number of tags that makes unmarked content count as rich HTML.</summary>
    private const int TagCountThreshold = 8;

    private const RegexOptions TagOptions = RegexOptions.IgnoreCase | RegexOptions.Compiled;

    // <script>/<style> elements including their bodies. The back-reference makes sure the
    // closing tag matches the opening one; Singleline lets the body span several lines.
    private static readonly Regex ScriptOrStyleBlock =
        new(@"<(script|style)\b[^>]*>.*?</\1\s*>", TagOptions | RegexOptions.Singleline);

    // <br>, <br/>, <br /> and <br> with attributes.
    private static readonly Regex LineBreakTag = new(@"<br\b[^>]*>", TagOptions);

    // Closing tags of block-level elements; each one ends a line.
    // NOTE(review): this element list is a best-effort choice — confirm it against the intended set.
    private static readonly Regex BlockCloseTag =
        new(@"</(p|div|h[1-6]|tr|table|ul|ol|blockquote)\s*>", TagOptions);

    private static readonly Regex ListItemOpenTag = new(@"<li\b[^>]*>", TagOptions);
    private static readonly Regex ListItemCloseTag = new(@"</li\s*>", TagOptions);
    private static readonly Regex CellOpenTag = new(@"<(td|th)\b[^>]*>", TagOptions);
    private static readonly Regex CellCloseTag = new(@"</(td|th)\s*>", TagOptions);
    private static readonly Regex HorizontalRuleTag = new(@"<hr\b[^>]*>", TagOptions);

    // Any remaining tag; Singleline is kept from the original call (it has no effect without '.').
    private static readonly Regex AnyTag =
        new(@"<[^>]+>", RegexOptions.Singleline | RegexOptions.Compiled);

    // Opening or closing tag that starts with a letter; used only for counting.
    private static readonly Regex CountableTag = new(@"</?[a-z][^>]*>", TagOptions);

    private static readonly Regex HorizontalWhitespaceRun = new(@"[ \t]{2,}", RegexOptions.Compiled);
    private static readonly Regex BlankLineRun = new(@"\n{3,}", RegexOptions.Compiled);

    // Structural tags that on their own identify content as rich HTML.
    // NOTE(review): this marker set is a best-effort choice — confirm it against the intended detection rules.
    private static readonly string[] RichHtmlMarkers =
    [
        "<html", "<body", "<div", "<span", "<p>", "<p ", "<br",
        "<ul", "<ol", "<li", "<table",
    ];

    /// <summary>
    /// Strips HTML markup from <paramref name="content"/> and returns a plain-text rendering.
    /// </summary>
    /// <param name="content">The entry content, either plain text or HTML.</param>
    /// <returns>
    /// The original <paramref name="content"/> when it is null/whitespace, when it does not look
    /// like rich HTML, or when stripping would leave nothing; otherwise the plain-text version
    /// with list items rendered as "- ", table cells separated by " | " and horizontal rules as "---".
    /// </returns>
    public static string StripRichHtml(string content)
    {
        if (string.IsNullOrWhiteSpace(content) || !LooksLikeRichHtml(content))
        {
            return content;
        }

        // Normalize line endings first so every later step only has to deal with '\n'.
        var text = content.Replace("\r\n", "\n").Replace("\r", "\n");

        // Script/style bodies must go before generic tag stripping, otherwise their
        // contents would survive as visible text.
        text = ScriptOrStyleBlock.Replace(text, "");

        // Translate structural tags into plain-text layout.
        text = LineBreakTag.Replace(text, "\n");
        text = BlockCloseTag.Replace(text, "\n");
        text = ListItemOpenTag.Replace(text, "\n- ");
        text = ListItemCloseTag.Replace(text, "\n");
        text = CellOpenTag.Replace(text, " | ");
        text = CellCloseTag.Replace(text, " ");
        text = HorizontalRuleTag.Replace(text, "\n---\n");

        // Drop whatever markup is left.
        text = AnyTag.Replace(text, "");

        // Decode entities only after tag removal so that an escaped "&lt;b&gt;" stays literal text.
        text = WebUtility.HtmlDecode(text)
            .Replace('\u00a0', ' ')                              // non-breaking space -> regular space
            .Replace("\u200b", "", StringComparison.Ordinal);    // remove zero-width spaces

        // Tidy whitespace: no trailing blanks, no runs of spaces/tabs, at most one empty line in a row.
        text = string.Join("\n", text.Split('\n').Select(line => line.TrimEnd()));
        text = HorizontalWhitespaceRun.Replace(text, " ");
        text = BlankLineRun.Replace(text, "\n\n").Trim();

        // Never turn a non-empty entry into an empty one; fall back to the raw content.
        return string.IsNullOrEmpty(text) ? content : text;
    }

    /// <summary>
    /// Heuristically decides whether <paramref name="content"/> is rich HTML rather than plain text.
    /// </summary>
    /// <param name="content">The content to inspect; null or empty is treated as not HTML.</param>
    /// <returns>
    /// <c>true</c> when the content contains one of the structural marker tags, or at least
    /// <see cref="TagCountThreshold"/> tags of any kind; otherwise <c>false</c>.
    /// </returns>
    public static bool LooksLikeRichHtml(string content)
    {
        if (string.IsNullOrEmpty(content))
        {
            return false;
        }

        // Case-insensitive ordinal search avoids allocating a lowered copy of the content.
        if (RichHtmlMarkers.Any(marker => content.Contains(marker, StringComparison.OrdinalIgnoreCase)))
        {
            return true;
        }

        return CountableTag.Matches(content).Count >= TagCountThreshold;
    }
}