From 7865e3bc8b46ff798f527caf2c4fc32d22fecb5b Mon Sep 17 00:00:00 2001 From: Jacob Schmidt Date: Tue, 24 Feb 2026 18:50:15 -0600 Subject: [PATCH] refactor: use GeneratedRegex in HtmlSanitizer for compile-time optimization - Convert HtmlSanitizer to partial class with [GeneratedRegex] attributes - Replace 12 runtime Regex.Replace/Matches calls with generated methods - Eliminate regex compilation overhead for HTML sanitization operations - Update REFACTORING_SUMMARY.md to document the optimization Co-Authored-By: Oz --- .../Services/Entries/HtmlSanitizer.cs | 61 +++++++++++++++---- REFACTORING_SUMMARY.md | 2 +- 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/Journal.Core/Services/Entries/HtmlSanitizer.cs b/Journal.Core/Services/Entries/HtmlSanitizer.cs index 163c538..7b31c48 100644 --- a/Journal.Core/Services/Entries/HtmlSanitizer.cs +++ b/Journal.Core/Services/Entries/HtmlSanitizer.cs @@ -3,8 +3,43 @@ using System.Text.RegularExpressions; namespace Journal.Core.Services.Entries; -public static class HtmlSanitizer +public static partial class HtmlSanitizer { + [GeneratedRegex("<(script|style)\\b[^>]*>.*?", RegexOptions.IgnoreCase | RegexOptions.Singleline)] + private static partial Regex ScriptStyleRegex(); + + [GeneratedRegex("", RegexOptions.IgnoreCase)] + private static partial Regex BrTagRegex(); + + [GeneratedRegex("", RegexOptions.IgnoreCase)] + private static partial Regex BlockEndTagRegex(); + + [GeneratedRegex("]*>", RegexOptions.IgnoreCase)] + private static partial Regex LiStartTagRegex(); + + [GeneratedRegex("", RegexOptions.IgnoreCase)] + private static partial Regex LiEndTagRegex(); + + [GeneratedRegex("<(td|th)\\b[^>]*>", RegexOptions.IgnoreCase)] + private static partial Regex CellStartTagRegex(); + + [GeneratedRegex("", RegexOptions.IgnoreCase)] + private static partial Regex CellEndTagRegex(); + + [GeneratedRegex("]*>", RegexOptions.IgnoreCase)] + private static partial Regex HrTagRegex(); + + [GeneratedRegex("<[^>]+>", 
RegexOptions.Singleline)] + private static partial Regex AllTagsRegex(); + + [GeneratedRegex("[ \\t]{2,}")] + private static partial Regex MultipleSpacesRegex(); + + [GeneratedRegex("\n{3,}")] + private static partial Regex MultipleNewlinesRegex(); + + [GeneratedRegex("]*>")] + private static partial Regex HtmlTagCountRegex(); public static string StripRichHtml(string content) { if (string.IsNullOrWhiteSpace(content)) @@ -13,21 +48,21 @@ public static class HtmlSanitizer return content; var text = content.Replace("\r\n", "\n").Replace("\r", "\n"); - text = Regex.Replace(text, "<(script|style)\\b[^>]*>.*?", "", RegexOptions.IgnoreCase | RegexOptions.Singleline); - text = Regex.Replace(text, "", "\n", RegexOptions.IgnoreCase); - text = Regex.Replace(text, "", "\n", RegexOptions.IgnoreCase); - text = Regex.Replace(text, "]*>", "\n- ", RegexOptions.IgnoreCase); - text = Regex.Replace(text, "", "\n", RegexOptions.IgnoreCase); - text = Regex.Replace(text, "<(td|th)\\b[^>]*>", " | ", RegexOptions.IgnoreCase); - text = Regex.Replace(text, "", " ", RegexOptions.IgnoreCase); - text = Regex.Replace(text, "]*>", "\n---\n", RegexOptions.IgnoreCase); - text = Regex.Replace(text, "<[^>]+>", "", RegexOptions.Singleline); + text = ScriptStyleRegex().Replace(text, ""); + text = BrTagRegex().Replace(text, "\n"); + text = BlockEndTagRegex().Replace(text, "\n"); + text = LiStartTagRegex().Replace(text, "\n- "); + text = LiEndTagRegex().Replace(text, "\n"); + text = CellStartTagRegex().Replace(text, " | "); + text = CellEndTagRegex().Replace(text, " "); + text = HrTagRegex().Replace(text, "\n---\n"); + text = AllTagsRegex().Replace(text, ""); text = WebUtility.HtmlDecode(text) .Replace('\u00a0', ' ') .Replace("\u200b", "", StringComparison.Ordinal); text = string.Join("\n", text.Split('\n').Select(line => line.TrimEnd())); - text = Regex.Replace(text, "[ \\t]{2,}", " "); - text = Regex.Replace(text, "\n{3,}", "\n\n").Trim(); + text = MultipleSpacesRegex().Replace(text, " "); + text = 
MultipleNewlinesRegex().Replace(text, "\n\n").Trim(); return string.IsNullOrEmpty(text) ? content : text; } @@ -41,6 +76,6 @@ public static class HtmlSanitizer ]; if (markers.Any(marker => lowered.Contains(marker, StringComparison.Ordinal))) return true; - return Regex.Matches(lowered, "]*>").Count >= 8; + return HtmlTagCountRegex().Matches(lowered).Count >= 8; } } diff --git a/REFACTORING_SUMMARY.md b/REFACTORING_SUMMARY.md index bda1ebd..a52d005 100644 --- a/REFACTORING_SUMMARY.md +++ b/REFACTORING_SUMMARY.md @@ -9,7 +9,7 @@ Removed all business logic, HTML processing, logging implementation, and private record types. `Entry` now only parses the incoming JSON command, routes to the correct service, and returns the `{ok, data}` / `{ok: false, error}` envelope. ### 2. Extracted `HtmlSanitizer` (new file) -`StripRichHtml` and `LooksLikeRichHtml` moved from `Entry.cs` to `Services/HtmlSanitizer.cs` as a static utility class. +`StripRichHtml` and `LooksLikeRichHtml` moved from `Entry.cs` to `Services/Entries/HtmlSanitizer.cs` as a static utility class. Refactored to use `[GeneratedRegex]` attributes for compile-time regex generation, improving performance by eliminating runtime regex compilation overhead. ### 3. Extracted `CommandLogger` (new file) `LogStart`, `LogSuccess`, `LogFailure`, `EmitLog`, `ShouldLog`, and `LogLevelRank` moved from `Entry.cs` to `Services/CommandLogger.cs`. Entry now receives this as a dependency.