// journal/Journal.Core/Services/Entries/HtmlSanitizer.cs

using System.Net;
using System.Text.RegularExpressions;

namespace Journal.Core.Services.Entries;

/// <summary>
/// Regex-based helpers that detect HTML-looking content and flatten it to plain text.
/// </summary>
/// <remarks>
/// This is a plain-text converter, not a security boundary: entities are decoded after the
/// tags are removed, so the result may legitimately contain <c>&lt;</c> and <c>&gt;</c>
/// characters. Callers that render the result as HTML must encode it themselves.
/// </remarks>
public static partial class HtmlSanitizer
{
    // Minimum number of tag-like tokens that marks content as HTML when no marker matched.
    private const int MinTagCountForRichHtml = 8;

    // Substrings whose presence alone classifies content as rich HTML (matched ignoring case).
    private static readonly string[] s_richHtmlMarkers =
    [
        "<p", "</p>", "<div", "<span", "<table", "<tr", "<td", "<li", "<ul", "<ol",
        "style=", "font-family:", "-webkit-text-stroke"
    ];

    // <script>/<style> elements including their bodies; the body is never user-visible text.
    [GeneratedRegex("<(script|style)\\b[^>]*>.*?</\\1\\s*>", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
    private static partial Regex ScriptStyleRegex();

    // <br>, <br/>, <br /> and <br ...attributes...>.
    [GeneratedRegex("<br\\b[^>]*>", RegexOptions.IgnoreCase)]
    private static partial Regex BrTagRegex();

    // Closing tags of block-level elements; each one ends a line of text.
    [GeneratedRegex("</(p|div|h[1-6]|tr|table|ul|ol|blockquote)\\s*>", RegexOptions.IgnoreCase)]
    private static partial Regex BlockEndTagRegex();

    [GeneratedRegex("<li\\b[^>]*>", RegexOptions.IgnoreCase)]
    private static partial Regex LiStartTagRegex();

    [GeneratedRegex("</li\\s*>", RegexOptions.IgnoreCase)]
    private static partial Regex LiEndTagRegex();

    [GeneratedRegex("<(td|th)\\b[^>]*>", RegexOptions.IgnoreCase)]
    private static partial Regex CellStartTagRegex();

    [GeneratedRegex("</(td|th)\\s*>", RegexOptions.IgnoreCase)]
    private static partial Regex CellEndTagRegex();

    [GeneratedRegex("<hr\\b[^>]*>", RegexOptions.IgnoreCase)]
    private static partial Regex HrTagRegex();

    // Whole comments first (they may contain '>'), then anything an HTML tokenizer would treat
    // as markup: '<' directly followed by a letter, '/', '!' or '?'. A '<' followed by anything
    // else (space, digit, ...) is ordinary text such as "1 < 2" or "<3" and must be kept.
    [GeneratedRegex("<!--.*?-->|<[a-zA-Z/!?][^>]*>", RegexOptions.Singleline)]
    private static partial Regex AllTagsRegex();

    [GeneratedRegex("[ \\t]{2,}")]
    private static partial Regex MultipleSpacesRegex();

    [GeneratedRegex("\n{3,}")]
    private static partial Regex MultipleNewlinesRegex();

    // Opening or closing tags, used only for counting in LooksLikeRichHtml.
    [GeneratedRegex("</?[a-z][^>]*>", RegexOptions.IgnoreCase)]
    private static partial Regex HtmlTagCountRegex();

    /// <summary>
    /// Converts HTML-looking <paramref name="content"/> to plain text; anything else is returned unchanged.
    /// </summary>
    /// <param name="content">The text to inspect. May be null, empty or whitespace.</param>
    /// <returns>
    /// <paramref name="content"/> itself when it is null/blank, does not satisfy
    /// <see cref="LooksLikeRichHtml"/>, or when stripping would leave nothing; otherwise the
    /// flattened text, in which line breaks and block ends become newlines, list items become
    /// "- " lines, table cells are separated by " | ", <c>&lt;hr&gt;</c> becomes "---",
    /// entities are decoded and runs of spaces and blank lines are collapsed.
    /// </returns>
    public static string StripRichHtml(string content)
    {
        if (string.IsNullOrWhiteSpace(content) || !LooksLikeRichHtml(content))
        {
            return content;
        }

        var text = content.Replace("\r\n", "\n").Replace("\r", "\n");

        // Structural tags are turned into plain-text layout before the remaining tags are dropped.
        text = ScriptStyleRegex().Replace(text, "");
        text = BrTagRegex().Replace(text, "\n");
        text = BlockEndTagRegex().Replace(text, "\n");
        text = LiStartTagRegex().Replace(text, "\n- ");
        text = LiEndTagRegex().Replace(text, "\n");
        text = CellStartTagRegex().Replace(text, " | ");
        text = CellEndTagRegex().Replace(text, " ");
        text = HrTagRegex().Replace(text, "\n---\n");
        text = AllTagsRegex().Replace(text, "");

        // Decode only after tag removal so that escaped markup (&lt;b&gt;) survives as literal text.
        text = WebUtility.HtmlDecode(text)
            .Replace('\u00a0', ' ')                              // non-breaking space -> space
            .Replace("\u200b", "", StringComparison.Ordinal);   // drop zero-width spaces

        // Trailing whitespace is trimmed per line first so whitespace-only lines count as blank
        // when consecutive newlines are collapsed below.
        text = string.Join("\n", text.Split('\n').Select(line => line.TrimEnd()));
        text = MultipleSpacesRegex().Replace(text, " ");
        text = MultipleNewlinesRegex().Replace(text, "\n\n").Trim();

        // Never replace non-blank input with an empty string; fall back to the original.
        return string.IsNullOrEmpty(text) ? content : text;
    }

    /// <summary>
    /// Heuristically decides whether <paramref name="content"/> contains rich HTML markup.
    /// </summary>
    /// <param name="content">The text to inspect. Null or empty yields <see langword="false"/>.</param>
    /// <returns>
    /// <see langword="true"/> when any known marker substring occurs (case-insensitively) or
    /// when at least eight tag-like tokens are found; otherwise <see langword="false"/>.
    /// </returns>
    public static bool LooksLikeRichHtml(string content)
    {
        if (string.IsNullOrEmpty(content))
        {
            return false;
        }

        foreach (var marker in s_richHtmlMarkers)
        {
            if (content.Contains(marker, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }

        return HtmlTagCountRegex().Count(content) >= MinTagCountForRichHtml;
    }
}