using System.Net;
using System.Text.RegularExpressions;
namespace Journal.Core.Services.Entries;
/// <summary>
/// Converts rich HTML content (for example, text pasted from a browser or a
/// word processor) into readable plain text, leaving content that does not
/// look like rich HTML untouched.
/// </summary>
/// <remarks>
/// NOTE(review): the tag patterns and the marker list in this class were
/// restored after the string literals had been corrupted by an HTML-stripping
/// step. The tag patterns follow from the surviving fragments; the exact
/// contents of <c>RichHtmlMarkers</c> could not be recovered and should be
/// confirmed against version control.
/// </remarks>
public static class HtmlSanitizer
{
    // CultureInvariant keeps case-insensitive tag matching independent of the
    // process culture (e.g. "<LI>" must match "li" under tr-TR as well).
    private const RegexOptions TagOptions = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    // Number of tag-like tokens at which unmarked content is still treated as HTML.
    private const int MinTagCountForRichHtml = 8;

    // Lower-case substrings whose presence alone marks the content as rich HTML.
    private static readonly string[] RichHtmlMarkers =
    [
        "<!--startfragment-->", "<div", "<span", "<table", "<p ", "<ul", "<ol",
        "<blockquote", "<h1", "<h2", "<h3",
    ];

    private static readonly Regex ScriptOrStyleBlock =
        new(@"<(script|style)\b[^>]*>.*?</\1>", TagOptions | RegexOptions.Singleline);

    private static readonly Regex LineBreakTag = new(@"<br\s*/?>", TagOptions);

    private static readonly Regex BlockClosingTag =
        new(@"</(p|div|h[1-6]|tr|table|ul|ol|blockquote)>", TagOptions);

    private static readonly Regex ListItemOpeningTag = new(@"<li\b[^>]*>", TagOptions);

    private static readonly Regex ListItemClosingTag = new("</li>", TagOptions);

    private static readonly Regex CellOpeningTag = new(@"<(td|th)\b[^>]*>", TagOptions);

    private static readonly Regex CellClosingTag = new("</(td|th)>", TagOptions);

    private static readonly Regex HorizontalRuleTag = new(@"<hr\b[^>]*>", TagOptions);

    private static readonly Regex AnyTag = new("<[^>]+>");

    private static readonly Regex HorizontalWhitespaceRun = new(@"[ \t]{2,}");

    private static readonly Regex ExcessBlankLines = new("\n{3,}");

    // Operates on lower-cased input, so no case-insensitivity option is needed.
    private static readonly Regex TagLikeToken = new("</?[a-z][^>]*>");

    /// <summary>
    /// Strips HTML markup from <paramref name="content"/> when it looks like
    /// rich HTML, turning structural tags into plain-text equivalents
    /// (line breaks, "- " list bullets, " | " table cell separators, "---" rules).
    /// </summary>
    /// <param name="content">The text to sanitize; may be plain text or HTML.</param>
    /// <returns>
    /// The plain-text rendering, or the original <paramref name="content"/> when
    /// it is null/blank, does not look like rich HTML, or would become empty.
    /// </returns>
    public static string StripRichHtml(string content)
    {
        if (string.IsNullOrWhiteSpace(content) || !LooksLikeRichHtml(content))
        {
            return content;
        }

        // Normalize line endings first so every later step only deals with '\n'.
        var text = content.Replace("\r\n", "\n").Replace("\r", "\n");

        // Drop script/style elements together with their bodies before any
        // other tag handling, so their contents never leak into the output.
        text = ScriptOrStyleBlock.Replace(text, "");

        // Translate structural tags into plain-text layout.
        text = LineBreakTag.Replace(text, "\n");
        text = BlockClosingTag.Replace(text, "\n");
        text = ListItemOpeningTag.Replace(text, "\n- ");
        text = ListItemClosingTag.Replace(text, "\n");
        text = CellOpeningTag.Replace(text, " | ");
        text = CellClosingTag.Replace(text, " ");
        text = HorizontalRuleTag.Replace(text, "\n---\n");

        // Remove every remaining tag, then decode entities. Decoding must come
        // after tag removal so that "&lt;b&gt;" survives as literal "<b>" text.
        text = AnyTag.Replace(text, "");
        text = WebUtility.HtmlDecode(text)
            .Replace('\u00a0', ' ')
            .Replace("\u200b", "", StringComparison.Ordinal);

        // Tidy whitespace: trailing spaces, runs of spaces/tabs, runs of blank lines.
        text = string.Join("\n", text.Split('\n').Select(line => line.TrimEnd()));
        text = HorizontalWhitespaceRun.Replace(text, " ");
        text = ExcessBlankLines.Replace(text, "\n\n").Trim();

        // Never replace non-blank input with nothing: fall back to the original.
        return string.IsNullOrEmpty(text) ? content : text;
    }

    /// <summary>
    /// Heuristically decides whether <paramref name="content"/> is rich HTML.
    /// </summary>
    /// <param name="content">The text to inspect.</param>
    /// <returns>
    /// <c>true</c> when the content contains one of the known rich-HTML markers
    /// or at least eight tag-like tokens; <c>false</c> otherwise, including for
    /// null or empty input.
    /// </returns>
    public static bool LooksLikeRichHtml(string content)
    {
        if (string.IsNullOrEmpty(content))
        {
            return false;
        }

        var lowered = content.ToLowerInvariant();
        if (RichHtmlMarkers.Any(marker => lowered.Contains(marker, StringComparison.Ordinal)))
        {
            return true;
        }

        return TagLikeToken.Matches(lowered).Count >= MinTagCountForRichHtml;
    }
}