refactor: use GeneratedRegex in HtmlSanitizer for compile-time optimization
- Convert HtmlSanitizer to a partial class with [GeneratedRegex] attributes
- Replace 12 runtime regex calls (11 Regex.Replace, 1 Regex.Matches) with generated methods
- Eliminate runtime regex construction overhead for HTML sanitization operations
- Update REFACTORING_SUMMARY.md to document the optimization

Co-Authored-By: Oz <oz-agent@warp.dev>
This commit is contained in:
parent
a0258446d7
commit
7865e3bc8b
@ -3,8 +3,43 @@ using System.Text.RegularExpressions;
|
||||
|
||||
namespace Journal.Core.Services.Entries;
|
||||
|
||||
public static class HtmlSanitizer
|
||||
public static partial class HtmlSanitizer
|
||||
{
|
||||
[GeneratedRegex("<(script|style)\\b[^>]*>.*?</\\1>", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
|
||||
private static partial Regex ScriptStyleRegex();
|
||||
|
||||
[GeneratedRegex("<br\\s*/?>", RegexOptions.IgnoreCase)]
|
||||
private static partial Regex BrTagRegex();
|
||||
|
||||
[GeneratedRegex("</(p|div|h[1-6]|tr|table|ul|ol|blockquote)>", RegexOptions.IgnoreCase)]
|
||||
private static partial Regex BlockEndTagRegex();
|
||||
|
||||
[GeneratedRegex("<li\\b[^>]*>", RegexOptions.IgnoreCase)]
|
||||
private static partial Regex LiStartTagRegex();
|
||||
|
||||
[GeneratedRegex("</li>", RegexOptions.IgnoreCase)]
|
||||
private static partial Regex LiEndTagRegex();
|
||||
|
||||
[GeneratedRegex("<(td|th)\\b[^>]*>", RegexOptions.IgnoreCase)]
|
||||
private static partial Regex CellStartTagRegex();
|
||||
|
||||
[GeneratedRegex("</(td|th)>", RegexOptions.IgnoreCase)]
|
||||
private static partial Regex CellEndTagRegex();
|
||||
|
||||
[GeneratedRegex("<hr\\b[^>]*>", RegexOptions.IgnoreCase)]
|
||||
private static partial Regex HrTagRegex();
|
||||
|
||||
[GeneratedRegex("<[^>]+>", RegexOptions.Singleline)]
|
||||
private static partial Regex AllTagsRegex();
|
||||
|
||||
[GeneratedRegex("[ \\t]{2,}")]
|
||||
private static partial Regex MultipleSpacesRegex();
|
||||
|
||||
[GeneratedRegex("\n{3,}")]
|
||||
private static partial Regex MultipleNewlinesRegex();
|
||||
|
||||
[GeneratedRegex("</?[a-z][^>]*>")]
|
||||
private static partial Regex HtmlTagCountRegex();
|
||||
public static string StripRichHtml(string content)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(content))
|
||||
@ -13,21 +48,21 @@ public static class HtmlSanitizer
|
||||
return content;
|
||||
|
||||
var text = content.Replace("\r\n", "\n").Replace("\r", "\n");
|
||||
text = Regex.Replace(text, "<(script|style)\\b[^>]*>.*?</\\1>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
||||
text = Regex.Replace(text, "<br\\s*/?>", "\n", RegexOptions.IgnoreCase);
|
||||
text = Regex.Replace(text, "</(p|div|h[1-6]|tr|table|ul|ol|blockquote)>", "\n", RegexOptions.IgnoreCase);
|
||||
text = Regex.Replace(text, "<li\\b[^>]*>", "\n- ", RegexOptions.IgnoreCase);
|
||||
text = Regex.Replace(text, "</li>", "\n", RegexOptions.IgnoreCase);
|
||||
text = Regex.Replace(text, "<(td|th)\\b[^>]*>", " | ", RegexOptions.IgnoreCase);
|
||||
text = Regex.Replace(text, "</(td|th)>", " ", RegexOptions.IgnoreCase);
|
||||
text = Regex.Replace(text, "<hr\\b[^>]*>", "\n---\n", RegexOptions.IgnoreCase);
|
||||
text = Regex.Replace(text, "<[^>]+>", "", RegexOptions.Singleline);
|
||||
text = ScriptStyleRegex().Replace(text, "");
|
||||
text = BrTagRegex().Replace(text, "\n");
|
||||
text = BlockEndTagRegex().Replace(text, "\n");
|
||||
text = LiStartTagRegex().Replace(text, "\n- ");
|
||||
text = LiEndTagRegex().Replace(text, "\n");
|
||||
text = CellStartTagRegex().Replace(text, " | ");
|
||||
text = CellEndTagRegex().Replace(text, " ");
|
||||
text = HrTagRegex().Replace(text, "\n---\n");
|
||||
text = AllTagsRegex().Replace(text, "");
|
||||
text = WebUtility.HtmlDecode(text)
|
||||
.Replace('\u00a0', ' ')
|
||||
.Replace("\u200b", "", StringComparison.Ordinal);
|
||||
text = string.Join("\n", text.Split('\n').Select(line => line.TrimEnd()));
|
||||
text = Regex.Replace(text, "[ \\t]{2,}", " ");
|
||||
text = Regex.Replace(text, "\n{3,}", "\n\n").Trim();
|
||||
text = MultipleSpacesRegex().Replace(text, " ");
|
||||
text = MultipleNewlinesRegex().Replace(text, "\n\n").Trim();
|
||||
return string.IsNullOrEmpty(text) ? content : text;
|
||||
}
|
||||
|
||||
@ -41,6 +76,6 @@ public static class HtmlSanitizer
|
||||
];
|
||||
if (markers.Any(marker => lowered.Contains(marker, StringComparison.Ordinal)))
|
||||
return true;
|
||||
return Regex.Matches(lowered, "</?[a-z][^>]*>").Count >= 8;
|
||||
return HtmlTagCountRegex().Matches(lowered).Count >= 8;
|
||||
}
|
||||
}
|
||||
|
||||
@ -9,7 +9,7 @@
|
||||
Removed all business logic, HTML processing, logging implementation, and private record types. `Entry` now only parses the incoming JSON command, routes to the correct service, and returns the `{ok, data}` / `{ok: false, error}` envelope.
|
||||
|
||||
### 2. Extracted `HtmlSanitizer` (new file)
|
||||
`StripRichHtml` and `LooksLikeRichHtml` moved from `Entry.cs` to `Services/HtmlSanitizer.cs` as a static utility class.
|
||||
`StripRichHtml` and `LooksLikeRichHtml` moved from `Entry.cs` to `Services/Entries/HtmlSanitizer.cs` as a static utility class. The class is now declared `static partial` and uses `[GeneratedRegex]` attributes, so each pattern's matching code is generated at compile time; this improves performance by eliminating runtime regex parsing and construction overhead.
|
||||
|
||||
### 3. Extracted `CommandLogger` (new file)
|
||||
`LogStart`, `LogSuccess`, `LogFailure`, `EmitLog`, `ShouldLog`, and `LogLevelRank` moved from `Entry.cs` to `Services/CommandLogger.cs`. Entry now receives this as a dependency.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user