using System.Net;
using System.Text.RegularExpressions;

namespace Journal.Core.Services.Entries;

/// <summary>
/// Converts rich HTML (for example, content pasted from a browser or word processor)
/// into readable plain text, leaving content that does not look like rich HTML untouched.
/// </summary>
/// <remarks>
/// NOTE(review): in the previous revision several patterns were empty (<c>""</c>) or truncated
/// (<c>"]*&gt;"</c>), and the marker list in <see cref="LooksLikeRichHtml"/> was unterminated.
/// An empty pattern matches at every position, so those replacements corrupted the text.
/// The patterns below were rebuilt from each regex's name and the replacement it is paired
/// with; the exact tag lists should be confirmed against the intended behavior.
/// </remarks>
public static partial class HtmlSanitizer
{
    /// <summary>Minimum number of tags for content without a marker to count as rich HTML.</summary>
    private const int RichHtmlTagThreshold = 8;

    // Substrings (lower-case) that identify a pasted HTML document or fragment on their own.
    // NOTE(review): reconstructed list - confirm against the markers the product expects.
    private static readonly string[] RichHtmlMarkers =
    [
        "<!--startfragment-->",
        "<!doctype html",
        "<html",
        "<body",
        "<meta ",
        "<table",
    ];

    // Whole <script>/<style> elements, including their bodies; \1 ties the closing tag to the opener.
    [GeneratedRegex("<(script|style)\\b[^>]*>.*?</\\1\\s*>", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
    private static partial Regex ScriptStyleRegex();

    // <br>, <br/>, <br />.
    [GeneratedRegex("<br\\s*/?>", RegexOptions.IgnoreCase)]
    private static partial Regex BrTagRegex();

    // Closing tags of block-level elements; each becomes a line break.
    // NOTE(review): reconstructed tag list - confirm.
    [GeneratedRegex("</(p|div|h[1-6]|tr|table|ul|ol|blockquote|pre|section|article)\\s*>", RegexOptions.IgnoreCase)]
    private static partial Regex BlockEndTagRegex();

    // \b keeps <li> from also matching <link ...>.
    [GeneratedRegex("<li\\b[^>]*>", RegexOptions.IgnoreCase)]
    private static partial Regex LiStartTagRegex();

    [GeneratedRegex("</li\\s*>", RegexOptions.IgnoreCase)]
    private static partial Regex LiEndTagRegex();

    [GeneratedRegex("<(td|th)\\b[^>]*>", RegexOptions.IgnoreCase)]
    private static partial Regex CellStartTagRegex();

    [GeneratedRegex("</(td|th)\\s*>", RegexOptions.IgnoreCase)]
    private static partial Regex CellEndTagRegex();

    [GeneratedRegex("<hr\\b[^>]*>", RegexOptions.IgnoreCase)]
    private static partial Regex HrTagRegex();

    // Any remaining tag; Singleline is kept from the original (the pattern contains no '.').
    [GeneratedRegex("<[^>]+>", RegexOptions.Singleline)]
    private static partial Regex AllTagsRegex();

    [GeneratedRegex("[ \\t]{2,}")]
    private static partial Regex MultipleSpacesRegex();

    [GeneratedRegex("\n{3,}")]
    private static partial Regex MultipleNewlinesRegex();

    // Applied to lower-cased input only, hence no IgnoreCase.
    [GeneratedRegex("</?[a-z][^>]*>")]
    private static partial Regex HtmlTagCountRegex();

    /// <summary>
    /// Reduces rich HTML to plain text: script/style elements are dropped, line breaks,
    /// list items, table cells and horizontal rules get textual equivalents, all other
    /// tags are removed, entities are decoded and whitespace is collapsed.
    /// </summary>
    /// <param name="content">The text to clean. May be null, empty or whitespace.</param>
    /// <returns>
    /// The plain-text rendering, or <paramref name="content"/> unchanged when it is
    /// null/whitespace, does not look like rich HTML, or strips down to an empty string.
    /// </returns>
    /// <remarks>
    /// Entities are decoded after tags are removed, so the result can contain literal
    /// angle brackets; it is plain text and must be encoded again before being rendered as HTML.
    /// </remarks>
    public static string StripRichHtml(string content)
    {
        if (string.IsNullOrWhiteSpace(content))
        {
            return content;
        }

        if (!LooksLikeRichHtml(content))
        {
            return content;
        }

        // Normalize line endings first so every later step only has to deal with '\n'.
        var text = content.Replace("\r\n", "\n").Replace("\r", "\n");

        // Order matters: structural tags are translated before the catch-all tag removal.
        text = ScriptStyleRegex().Replace(text, "");
        text = BrTagRegex().Replace(text, "\n");
        text = BlockEndTagRegex().Replace(text, "\n");
        text = LiStartTagRegex().Replace(text, "\n- ");
        text = LiEndTagRegex().Replace(text, "\n");
        text = CellStartTagRegex().Replace(text, " | ");
        text = CellEndTagRegex().Replace(text, " ");
        text = HrTagRegex().Replace(text, "\n---\n");
        text = AllTagsRegex().Replace(text, "");

        // Decode entities, then turn non-breaking spaces into plain spaces and drop zero-width spaces.
        text = WebUtility.HtmlDecode(text)
            .Replace('\u00a0', ' ')
            .Replace("\u200b", "", StringComparison.Ordinal);

        // Trim trailing whitespace per line so blank-looking lines collapse correctly below.
        var lines = Array.ConvertAll(text.Split('\n'), static line => line.TrimEnd());
        text = string.Join("\n", lines);

        text = MultipleSpacesRegex().Replace(text, " ");
        text = MultipleNewlinesRegex().Replace(text, "\n\n").Trim();

        // Never hand back an empty string for non-empty input; fall back to the original content.
        return string.IsNullOrEmpty(text) ? content : text;
    }

    /// <summary>
    /// Heuristically decides whether <paramref name="content"/> is rich HTML rather than
    /// plain text that merely contains a few inline tags.
    /// </summary>
    /// <param name="content">The text to inspect. Null or empty yields <c>false</c>.</param>
    /// <returns>
    /// <c>true</c> when the lower-cased content contains one of the known markers or at
    /// least <c>8</c> tags; otherwise <c>false</c>.
    /// </returns>
    public static bool LooksLikeRichHtml(string content)
    {
        if (string.IsNullOrEmpty(content))
        {
            return false;
        }

        var lowered = content.ToLowerInvariant();

        foreach (var marker in RichHtmlMarkers)
        {
            if (lowered.Contains(marker, StringComparison.Ordinal))
            {
                return true;
            }
        }

        return HtmlTagCountRegex().Matches(lowered).Count >= RichHtmlTagThreshold;
    }
}