- Move standalone fragment storage from unencrypted SQLite to the existing encrypted SQLCipher database (journal_cache.db)
- Add IDatabaseSessionService/DatabaseSessionService for shared encrypted connection management after authentication
- Update fragments table schema: nullable entry_id, add guid column
- Reorganize flat Services/ directory (28 files) into 9 domain modules: Ai, Config, Database, Entries, Fragments, Logging, Sidecar, Speech, Vault
- Update all namespace declarations and using statements across all projects
- Update REFACTORING_SUMMARY.md with all changes

Co-Authored-By: Warp <agent@warp.dev>
47 lines
2.1 KiB
C#
47 lines
2.1 KiB
C#
using System.Net;
|
|
using System.Text.RegularExpressions;
|
|
|
|
namespace Journal.Core.Services.Entries;
|
|
|
|
/// <summary>
/// Heuristically detects content that looks like rich HTML and reduces it to plain text,
/// keeping a rough textual rendering of line breaks, list items, table cells and horizontal rules.
/// </summary>
public static class HtmlSanitizer
{
    // Case-insensitive tag patterns are matched with invariant casing rules. Without
    // CultureInvariant, RegexOptions.IgnoreCase follows the current culture, and under
    // Turkish/Azeri casing an upper-case "I" is not equivalent to "i", so tags such as
    // <LI>, </DIV> or <SCRIPT> would slip past their patterns.
    private const RegexOptions TagOptions = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    // Minimum number of generic tags that marks content as rich HTML when no marker is present.
    private const int MinGenericTagCount = 8;

    // The patterns are built once and reused instead of being looked up on every call.
    private static readonly Regex ScriptOrStyleBlock =
        new("<(script|style)\\b[^>]*>.*?</\\1>", TagOptions | RegexOptions.Singleline);

    private static readonly Regex LineBreakTag = new("<br\\s*/?>", TagOptions);

    private static readonly Regex BlockCloseTag =
        new("</(p|div|h[1-6]|tr|table|ul|ol|blockquote)>", TagOptions);

    private static readonly Regex ListItemOpenTag = new("<li\\b[^>]*>", TagOptions);

    private static readonly Regex ListItemCloseTag = new("</li>", TagOptions);

    private static readonly Regex CellOpenTag = new("<(td|th)\\b[^>]*>", TagOptions);

    private static readonly Regex CellCloseTag = new("</(td|th)>", TagOptions);

    private static readonly Regex HorizontalRuleTag = new("<hr\\b[^>]*>", TagOptions);

    private static readonly Regex AnyTag = new("<[^>]+>", RegexOptions.Singleline);

    private static readonly Regex HorizontalWhitespaceRun = new("[ \\t]{2,}");

    private static readonly Regex ExcessBlankLines = new("\n{3,}");

    // Applied to an already lower-cased copy of the input, so no IgnoreCase is needed.
    private static readonly Regex GenericTag = new("</?[a-z][^>]*>");

    // Lower-case substrings whose presence alone classifies content as rich HTML.
    private static readonly string[] RichHtmlMarkers =
    [
        "<p", "</p>", "<div", "<span", "<table", "<tr", "<td", "<li", "<ul", "<ol",
        "style=", "font-family:", "-webkit-text-stroke"
    ];

    /// <summary>
    /// Converts rich HTML into plain text. Content that is null, empty, whitespace-only,
    /// or that does not pass <see cref="LooksLikeRichHtml"/> is returned unchanged.
    /// </summary>
    /// <param name="content">The text to convert; may be null.</param>
    /// <returns>
    /// The plain-text rendering, or the original <paramref name="content"/> when it is not
    /// treated as rich HTML or when stripping would leave nothing.
    /// </returns>
    public static string StripRichHtml(string content)
    {
        if (string.IsNullOrWhiteSpace(content) || !LooksLikeRichHtml(content))
        {
            return content;
        }

        // Normalize line endings so every later step only has to deal with "\n".
        var text = content.Replace("\r\n", "\n").Replace("\r", "\n");

        // Drop script/style elements together with their bodies before any other tag handling.
        text = ScriptOrStyleBlock.Replace(text, "");

        // Turn structural tags into plain-text equivalents.
        text = LineBreakTag.Replace(text, "\n");
        text = BlockCloseTag.Replace(text, "\n");
        text = ListItemOpenTag.Replace(text, "\n- ");
        text = ListItemCloseTag.Replace(text, "\n");
        text = CellOpenTag.Replace(text, " | ");
        text = CellCloseTag.Replace(text, " ");
        text = HorizontalRuleTag.Replace(text, "\n---\n");

        // Remove every remaining tag.
        text = AnyTag.Replace(text, "");

        // Decode entities only after tags are gone, so encoded markup such as "&lt;b&gt;"
        // survives as literal text; then map non-breaking spaces to plain spaces and drop
        // zero-width spaces.
        text = WebUtility.HtmlDecode(text)
            .Replace('\u00a0', ' ')
            .Replace("\u200b", "", StringComparison.Ordinal);

        // Whitespace clean-up: trailing whitespace per line, runs of spaces/tabs, surplus blank lines.
        text = string.Join("\n", text.Split('\n').Select(line => line.TrimEnd()));
        text = HorizontalWhitespaceRun.Replace(text, " ");
        text = ExcessBlankLines.Replace(text, "\n\n").Trim();

        // Fall back to the original content rather than returning an empty result.
        return string.IsNullOrEmpty(text) ? content : text;
    }

    /// <summary>
    /// Heuristic check for rich HTML: true when the content contains any of the known
    /// markers (case-insensitively) or at least eight tag-like sequences.
    /// </summary>
    /// <param name="content">The text to inspect; null or empty yields <c>false</c>.</param>
    /// <returns><c>true</c> when the content is considered rich HTML; otherwise <c>false</c>.</returns>
    public static bool LooksLikeRichHtml(string content)
    {
        // Guard against null so this public entry point never throws NullReferenceException.
        if (string.IsNullOrEmpty(content))
        {
            return false;
        }

        var lowered = content.ToLowerInvariant();

        foreach (var marker in RichHtmlMarkers)
        {
            if (lowered.Contains(marker, StringComparison.Ordinal))
            {
                return true;
            }
        }

        return GenericTag.Matches(lowered).Count >= MinGenericTagCount;
    }
}
|