# Project_Journal-Csharp_back.../journal/core/parser.py

import sys
import re
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
from .models import JournalEntry, Fragment, ParsedSection, SECTION_TITLES

# Matches a Markdown task-list item such as "- [x] text" or "* [ ] text".
# Group 1 is the mark inside the brackets ("x", "X" or a space); group 2 is
# the text that follows the closing bracket.
CHECKBOX_PATTERN = re.compile(r"^\s*[-*]\s*\[([xX ])\]\s*(.*)$")


def parse_journal_file(file_path: str) -> JournalEntry:
    """Read the journal file at *file_path* as UTF-8 and parse its text.

    The file name without its final suffix is handed to
    ``parse_journal_content`` as the file stem.
    """
    journal_path = Path(file_path)
    text = journal_path.read_text(encoding="utf-8")
    return parse_journal_content(text, journal_path.stem)


# Matches a "Date:" or "**Date:**" label and captures the text that follows it.
_DATE_PATTERN = re.compile(r"(?:\*\*Date:\*\*|Date:)\s*(.+)")

# Matches a Markdown header of level two or deeper and captures its text.
_SECTION_HEADER_PATTERN = re.compile(r"^\#\#+\s*(.*)$")

# A fragment is a "!TYPE [@time] [#tag ...]" header line followed by zero or
# more description lines; it ends at the next line starting with "!TYPE" or at
# the end of the text. Every line it consumes must be terminated by "\n".
_FRAGMENT_PATTERN = re.compile(
    r"^(!\w+)\s*((?:@\S+\s*)?)(?:\s*((?:#\S+\s*)*))?\s*\n"  # Type, optional time, optional tags, newline
    r"((?:(?!^!\w+\s*).*\n)*)",  # Description lines up to the next fragment header
    re.MULTILINE,
)


def _parse_sections(content: str) -> "dict[str, ParsedSection]":
    """Collect the recognized sections of *content*, keyed by section title.

    A section starts at a level-two-or-deeper header whose text contains one
    of ``SECTION_TITLES`` (case-insensitive, first match wins) and runs until
    the next such header of any kind. Lines under an unrecognized header are
    ignored. If a title occurs more than once, the last occurrence wins.
    """
    sections: dict[str, ParsedSection] = {}
    title: str | None = None
    body: list[str] = []
    checkboxes: dict[str, bool] = {}

    for line in content.splitlines():
        header_match = _SECTION_HEADER_PATTERN.match(line.strip())
        if header_match:
            # A new header closes whatever recognized section was open.
            if title:
                sections[title] = ParsedSection(
                    title=title, content=body, checkboxes=checkboxes
                )
            header_text = header_match.group(1).strip().lower()
            title = next(
                (key for key in SECTION_TITLES if key.lower() in header_text),
                None,
            )
            body, checkboxes = [], {}
            continue  # The header line itself is not part of the content.

        if not title:
            continue

        checkbox_match = CHECKBOX_PATTERN.match(line)
        if checkbox_match:
            item_text = checkbox_match.group(2).strip()
            checkboxes[item_text] = checkbox_match.group(1).strip().lower() == "x"
        # Checkbox lines are kept in the section content as well.
        body.append(line)

    if title:
        sections[title] = ParsedSection(
            title=title, content=body, checkboxes=checkboxes
        )
    return sections


def _parse_fragments(content: str) -> "list[Fragment]":
    """Extract every ``!TYPE [@time] [#tags]`` fragment found in *content*."""
    # The pattern only consumes newline-terminated lines. Without this, text
    # lacking a trailing newline would lose the last description line of the
    # final fragment, or miss a fragment header sitting on the last line.
    text = content if content.endswith("\n") else content + "\n"

    fragments: list[Fragment] = []
    for match in _FRAGMENT_PATTERN.finditer(text):
        raw_time = match.group(2)
        time_str = raw_time.strip().lstrip("@") if raw_time else None
        tags = [tag.lstrip("#") for tag in (match.group(3) or "").split()]
        fragments.append(
            Fragment(
                type=match.group(1),
                description=match.group(4).strip(),
                time=time_str,
                tags=tags,
            )
        )
    return fragments


def parse_journal_content(content: str, file_stem: str) -> JournalEntry:
    """Parse the raw text content of a journal entry.

    Args:
        content: Full text of the journal entry.
        file_stem: Fallback used as the entry date when *content* contains no
            ``Date:`` / ``**Date:**`` label.

    Returns:
        A ``JournalEntry`` carrying the date, the unmodified *content* as
        ``raw_content``, the parsed fragments and the recognized sections.
    """
    date_match = _DATE_PATTERN.search(content)
    date = date_match.group(1).strip() if date_match else file_stem

    return JournalEntry(
        date=date,
        raw_content=content,
        fragments=_parse_fragments(content),
        sections=_parse_sections(content),
    )