423 lines
13 KiB
Python
423 lines
13 KiB
Python
from collections import Counter
|
|
import re
|
|
from typing import Any
|
|
import os
|
|
|
|
import requests
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
|
|
from journal.core.models import JournalEntry, Fragment
|
|
from journal.core.config import (
|
|
NLP_BACKEND,
|
|
LLAMA_CPP_URL,
|
|
LLAMA_CPP_MODEL,
|
|
LLAMA_CPP_TIMEOUT,
|
|
EMBEDDING_API_URL,
|
|
EMBEDDING_MODEL_NAME,
|
|
MODEL_CONTEXT_TOKENS,
|
|
CHUNK_TOKEN_BUDGET,
|
|
)
|
|
from journal.ai.api_compat import (
|
|
build_text_payload,
|
|
detect_text_endpoint_kind,
|
|
extract_embedding_response,
|
|
extract_text_response,
|
|
normalize_embedding_url,
|
|
normalize_endpoint_url,
|
|
)
|
|
|
|
_BACKEND_AUTO = "auto"
|
|
_BACKEND_SPACY = "spacy"
|
|
_BACKEND_FALLBACK = "fallback"
|
|
_VALID_BACKENDS = {_BACKEND_AUTO, _BACKEND_SPACY, _BACKEND_FALLBACK}
|
|
_backend_name: str | None = None
|
|
_spacy_nlp: Any | None = None
|
|
_fallback_warning_printed = False
|
|
_backend_requested: str | None = None
|
|
|
|
# Common English function words that _extract_themes_fallback() drops before
# counting word and phrase frequencies.
_STOP_WORDS = {
    "about", "after", "again", "against", "also", "and",
    "because", "before", "being", "between", "both",
    "could", "during", "from", "have", "into", "just", "like",
    "more", "most", "over", "same", "some", "such",
    "than", "that", "their", "them", "then", "there", "these", "they",
    "this", "those", "through", "under", "until", "very",
    "what", "when", "where", "which", "while", "with", "would", "your",
}
|
|
|
|
|
|
def _resolve_backend() -> str:
    """Choose the NLP backend, caching the decision between calls.

    The requested backend comes from the ``JOURNAL_NLP_BACKEND`` environment
    variable, defaulting to ``NLP_BACKEND`` from config; any value outside
    ``_VALID_BACKENDS`` is treated as ``auto``. The cached answer is reused
    until the requested value changes.

    Returns:
        ``_BACKEND_SPACY`` when the spaCy model loads, otherwise
        ``_BACKEND_FALLBACK``.

    Raises:
        RuntimeError: ``spacy`` was requested explicitly but importing spaCy
            or loading its model failed.
    """
    global _backend_name, _spacy_nlp, _fallback_warning_printed, _backend_requested

    wanted = os.getenv("JOURNAL_NLP_BACKEND", NLP_BACKEND).strip().lower()
    if wanted not in _VALID_BACKENDS:
        wanted = _BACKEND_AUTO

    # Only reuse the cache while the setting is unchanged, so a runtime change
    # of the requested backend triggers a fresh resolution.
    if _backend_name is not None and wanted == _backend_requested:
        return _backend_name

    # Start from a clean slate before resolving again.
    _backend_name, _spacy_nlp = None, None
    _backend_requested = wanted

    if wanted == _BACKEND_FALLBACK:
        _backend_name = _BACKEND_FALLBACK
        return _backend_name

    try:
        import spacy

        _spacy_nlp = spacy.load("en_core_web_sm")
    except Exception as exc:
        # An explicit spaCy request must not silently degrade.
        if wanted == _BACKEND_SPACY:
            raise RuntimeError(
                "JOURNAL_NLP_BACKEND=spacy but spaCy backend initialization failed. "
                "Install optional NLP deps/model or set JOURNAL_NLP_BACKEND=auto|fallback."
            ) from exc
    else:
        _backend_name = _BACKEND_SPACY
        return _backend_name

    # "auto" with spaCy unavailable: degrade to heuristics, warning only once.
    _backend_name = _BACKEND_FALLBACK
    if not _fallback_warning_printed:
        print(
            "WARNING: spaCy backend unavailable; using fallback NLP heuristics. "
            "Set JOURNAL_NLP_BACKEND=fallback to silence this warning."
        )
        _fallback_warning_printed = True
    return _backend_name
|
|
|
|
|
|
def get_nlp_backend() -> str:
    """Return the name of the NLP backend in effect: 'spacy' or 'fallback'.

    Delegates to ``_resolve_backend()``, so the first call (or a call after
    the requested backend changed) performs the actual resolution.
    """
    active_backend = _resolve_backend()
    return active_backend
|
|
|
|
|
|
def count_tokens(text: str) -> int:
    """Roughly estimate how many tokens *text* contains.

    This is a flat heuristic of four characters per token (integer division),
    not a real tokenizer. The result is never below 1, even for an empty
    string.
    """
    estimate = len(text) // 4
    return estimate if estimate > 1 else 1
|
|
|
|
|
|
def llama_cpp_generate(
    prompt: str,
    model: str | None = None,
    temperature: float = 0.7,
    max_tokens: int = 2048,
) -> str:
    """Send *prompt* to the configured llama.cpp endpoint and return its text.

    The endpoint URL, model name and timeout are read from the
    ``JOURNAL_LLAMA_CPP_URL``, ``JOURNAL_LLAMA_CPP_MODEL`` and
    ``JOURNAL_LLAMA_CPP_TIMEOUT`` environment variables, with the matching
    config constants as fallback. A timeout that is not an integer, or is not
    positive, is replaced by ``LLAMA_CPP_TIMEOUT``.

    Args:
        prompt: Text to submit for completion.
        model: Model name override; when falsy the configured model is used.
        temperature: Passed through to ``build_text_payload``.
        max_tokens: Passed through to ``build_text_payload``.

    Returns:
        The extracted response text. If no text can be extracted the literal
        ``"No response generated."`` is returned, and if anything raises
        during the request a string starting with
        ``"Error communicating with llama.cpp server:"`` is returned instead
        of propagating the exception.
    """
    configured_url = os.getenv("JOURNAL_LLAMA_CPP_URL", LLAMA_CPP_URL).strip()
    endpoint = normalize_endpoint_url(configured_url or LLAMA_CPP_URL, "/v1/completions")

    model_name = model
    if not model_name:
        configured_model = os.getenv("JOURNAL_LLAMA_CPP_MODEL", LLAMA_CPP_MODEL).strip()
        model_name = configured_model or LLAMA_CPP_MODEL

    timeout_text = os.getenv("JOURNAL_LLAMA_CPP_TIMEOUT", str(LLAMA_CPP_TIMEOUT)).strip()
    try:
        timeout_seconds = int(timeout_text)
    except ValueError:
        timeout_seconds = LLAMA_CPP_TIMEOUT
    timeout_seconds = LLAMA_CPP_TIMEOUT if timeout_seconds <= 0 else timeout_seconds

    request_body = build_text_payload(
        prompt,
        model_name,
        detect_text_endpoint_kind(endpoint),
        temperature=temperature,
        max_tokens=max_tokens,
    )

    try:
        reply = requests.post(endpoint, json=request_body, timeout=timeout_seconds)
        reply.raise_for_status()
        generated = extract_text_response(reply.json())

        if not generated:
            print("DEBUG: No parsable text in response payload")
            return "No response generated."

        print(f"DEBUG: Generated {len(generated)} characters")
        if len(generated) < 10:
            print(f"DEBUG: Short response: '{generated}'")
        return generated
    except Exception as e:
        # Best effort by design: callers receive a message, never an exception.
        print(f"DEBUG: Exception occurred: {e}")
        return f"Error communicating with llama.cpp server: {e}"
|
|
|
|
|
|
def generate_embedding(text: str) -> list[float]:
    """Request an embedding for *text* from the configured embedding endpoint.

    The URL and model name are read from ``JOURNAL_EMBEDDING_API_URL`` and
    ``JOURNAL_EMBEDDING_MODEL_NAME``, with config constants as fallback. The
    request timeout reuses the llama.cpp timeout setting
    (``JOURNAL_LLAMA_CPP_TIMEOUT`` / ``LLAMA_CPP_TIMEOUT``).

    Returns:
        The value extracted by ``extract_embedding_response``, or an empty
        list when the response carries no embedding or any exception is
        raised while making the request.
    """
    configured_url = os.getenv("JOURNAL_EMBEDDING_API_URL", EMBEDDING_API_URL).strip()
    endpoint = normalize_embedding_url(configured_url or EMBEDDING_API_URL)

    configured_model = os.getenv("JOURNAL_EMBEDDING_MODEL_NAME", EMBEDDING_MODEL_NAME).strip()
    model_name = configured_model or EMBEDDING_MODEL_NAME

    # There is no dedicated embedding timeout yet; reuse the llama.cpp one.
    timeout_text = os.getenv("JOURNAL_LLAMA_CPP_TIMEOUT", str(LLAMA_CPP_TIMEOUT)).strip()
    try:
        timeout_seconds = int(timeout_text)
    except ValueError:
        timeout_seconds = LLAMA_CPP_TIMEOUT
    timeout_seconds = LLAMA_CPP_TIMEOUT if timeout_seconds <= 0 else timeout_seconds

    request_body = {"model": model_name, "input": text}

    try:
        reply = requests.post(endpoint, json=request_body, timeout=timeout_seconds)
        reply.raise_for_status()
        vector = extract_embedding_response(reply.json())
        if not vector:
            print("DEBUG: No embedding data in response")
            return []
        return vector
    except Exception as e:
        # Best effort by design: failures yield an empty embedding.
        print(f"DEBUG: Exception occurred during embedding generation: {e}")
        return []
|
|
|
|
|
|
def synthesize_summaries(chunk_summaries: list[str]) -> str:
    """Merge per-chunk summaries into one final report via the model.

    The summaries are joined with a ``---`` divider, wrapped in a short
    prompt and sent to ``llama_cpp_generate``. Debug lines describing the
    input size, prompt size and the first 100 characters of the result are
    printed along the way.

    Returns:
        Whatever ``llama_cpp_generate`` returns for the synthesis prompt.
    """
    divider = "\n\n---\n\n"
    merged = divider.join(chunk_summaries)
    print(
        f"DEBUG: Synthesizing {len(chunk_summaries)} summaries, total chars: {len(merged)}"
    )

    # Deliberately minimal prompt: instruction, material, then a "Summary:" cue.
    prompt = "".join(
        [
            "Please analyze and summarize the following Journals as a professional Psychologist:\n\n",
            merged,
            "\n\n",
            "Summary:",
        ]
    )
    print(f"DEBUG: Synthesis prompt length: {len(prompt)} characters")

    report = llama_cpp_generate(prompt, max_tokens=2048)
    preview = report[:100]  # keep the debug line short
    print(f"DEBUG: Final synthesis result: '{preview}...'")
    return report
|
|
|
|
|
|
def summarize_chunk(entries: list[JournalEntry]) -> str:
    """Ask the model for a psychological analysis of one batch of entries.

    Each entry's ``raw_content`` is joined with a ``---`` divider and embedded
    in a fixed analysis prompt.

    Returns:
        Whatever ``llama_cpp_generate`` returns for that prompt.
    """
    divider = "\n\n---\n\n"
    combined_text = divider.join(entry.raw_content for entry in entries)

    instructions = (
        "You are a psychological analysis agent. Given the following journal entries, "
        "analyze and report on:\n"
        "- Recurring psychological themes\n"
        "- Behavioral patterns\n"
        "- Emotional trends\n"
        "- Coping mechanisms\n"
        "- Notable changes over time\n\n"
    )
    prompt = (
        instructions
        + "Journal entries:\n"
        + f"{combined_text}\n\n"
        + "Respond with a concise, insightful analysis for this batch."
    )
    return llama_cpp_generate(prompt, max_tokens=2048)
|
|
|
|
|
|
def extract_themes(text: str) -> list[str]:
    """Return up to ten recurring themes found in *text*.

    With the spaCy backend active, candidates are the lower-cased named
    entities of selected labels plus noun chunks of two to four words; only
    candidates seen more than once are kept, most frequent first. In every
    other case, including any error raised while spaCy processes the text,
    the result of ``_extract_themes_fallback`` is returned.
    """
    # Resolve first: resolution is what populates the module-level _spacy_nlp.
    if _resolve_backend() != _BACKEND_SPACY or _spacy_nlp is None:
        return _extract_themes_fallback(text)

    wanted_labels = ("PERSON", "ORG", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE")
    try:
        parsed = _spacy_nlp(text)
        candidates = [
            entity.text.lower()
            for entity in parsed.ents
            if entity.label_ in wanted_labels
        ]
        candidates.extend(
            noun_chunk.text.lower()
            for noun_chunk in parsed.noun_chunks
            if 2 <= len(noun_chunk.text.split()) <= 4
        )
        tally = Counter(candidates)
        return [theme for theme, hits in tally.most_common(10) if hits > 1]
    except Exception:
        # Deliberate best effort: use the heuristic extractor when the spaCy
        # model fails at runtime.
        pass

    return _extract_themes_fallback(text)
|
|
|
|
|
|
def _extract_themes_fallback(text: str) -> list[str]:
    """Heuristic theme extraction that needs no NLP library.

    Words of three or more characters (letters, apostrophes, hyphens, starting
    with a letter) are taken from the lower-cased text and stop words are
    removed. Themes are then picked in two passes, both requiring more than
    one occurrence: first adjacent word pairs (ignoring a word paired with
    itself) from the 20 most common pairs, then single words from the 30 most
    common words. At most ten themes are returned, phrases first.
    """
    tokens = [
        token
        for token in re.findall(r"[A-Za-z][A-Za-z'-]{2,}", text.lower())
        if token not in _STOP_WORDS
    ]
    if not tokens:
        return []

    word_tally = Counter(tokens)
    # Pairs are formed after stop-word removal, so they may bridge a removed word.
    pair_tally = Counter(
        f"{left} {right}"
        for left, right in zip(tokens, tokens[1:])
        if left != right
    )

    themes: list[str] = []
    for phrase, hits in pair_tally.most_common(20):
        if hits <= 1:
            continue
        themes.append(phrase)
        if len(themes) >= 10:
            return themes

    for word, hits in word_tally.most_common(30):
        if hits > 1 and word not in themes:
            themes.append(word)
            if len(themes) >= 10:
                break

    return themes
|
|
|
|
|
|
def analyze_fragments(fragments: list[Fragment]) -> str:
    """Build a one-line textual summary of the given fragments.

    The summary reports the number of fragments, the most frequent fragment
    ``type`` with its count, and up to three of the most common ``tags``.

    Returns:
        ``"No fragments recorded."`` for an empty input, otherwise the
        assembled summary sentence(s).
    """
    if not fragments:
        return "No fragments recorded."

    type_tally = Counter(fragment.type for fragment in fragments)
    tag_tally = Counter(tag for fragment in fragments for tag in fragment.tags)

    pieces = [f"{len(fragments)} discrete events recorded. "]
    if type_tally:
        ((leading_type, occurrences),) = type_tally.most_common(1)
        pieces.append(f"Most frequent: {leading_type} ({occurrences} times). ")
    if tag_tally:
        leading_tags = [tag for tag, _ in tag_tally.most_common(3)]
        pieces.append(f"Key themes: {', '.join(leading_tags)}.")
    return "".join(pieces)
|
|
|
|
|
|
def summarize_all_entries(entries: list[JournalEntry]) -> str:
    """Produce one report covering every entry.

    The entries are split with ``chunk_journal_entries``, each chunk is
    summarised with ``summarize_chunk``, and the partial summaries are
    combined by ``synthesize_summaries``. Progress lines are printed while
    this runs.

    Returns:
        ``"No entries found to analyze."`` for an empty input, otherwise the
        synthesised report.
    """
    # Called only for its effect: resolves (and validates) the NLP backend,
    # which raises if spaCy was explicitly requested but is unavailable.
    _resolve_backend()
    if not entries:
        return "No entries found to analyze."

    # Split so each request stays within the model's context budget.
    chunks = chunk_journal_entries(entries)
    total = len(chunks)

    chunk_summaries = []
    for position, chunk in enumerate(chunks, start=1):
        print(f"Analyzing chunk {position}/{total} ({len(chunk)} entries)...")
        chunk_summaries.append(summarize_chunk(chunk))

    print("Synthesizing final report...")
    return synthesize_summaries(chunk_summaries)
|
|
|
|
|
|
def identify_patterns(entries: list[JournalEntry]) -> list[str]:
    """Ask the model for a pattern analysis across all given entries.

    All ``raw_content`` values are joined with single spaces into one prompt
    (no chunking is applied here), together with the ``date`` of the first
    and last entry in the list as given.

    Returns:
        A one-element list holding the model's answer, or
        ``["No entries to analyze."]`` for an empty input.
    """
    # Called only for its effect: resolves (and validates) the NLP backend.
    _resolve_backend()
    if not entries:
        return ["No entries to analyze."]

    bodies = [entry.raw_content for entry in entries]
    dates = [entry.date for entry in entries]
    first_date, last_date = dates[0], dates[-1]
    combined_text = " ".join(bodies)

    instructions = (
        "You are a psychological pattern analysis agent. "
        "Given the following journal entries, identify:\n"
        "- Recurring psychological themes\n"
        "- Behavioral patterns\n"
        "- Emotional trends\n"
        "- Coping mechanisms\n"
        "- Notable changes over time\n\n"
    )
    prompt = (
        instructions
        + f"Journal entries span from {first_date} to {last_date}.\n"
        + f"Entries:\n{combined_text}\n\n"
        + "Respond with a concise, insightful pattern analysis."
    )
    return [llama_cpp_generate(prompt)]
|
|
|
|
|
|
def chunk_journal_entries(
    entries: list[JournalEntry], token_budget: int | None = None
) -> list[list[JournalEntry]]:
    """Group consecutive entries into chunks that fit a token budget.

    Entries keep their order. A chunk is closed as soon as adding the next
    entry would push its estimated size (per ``count_tokens``) over the
    budget. An entry that alone exceeds the budget still gets a chunk of its
    own rather than being dropped or split.

    Args:
        entries: Journal entries to group.
        token_budget: Maximum estimated tokens per chunk. When ``None`` the
            value is read from ``JOURNAL_CHUNK_TOKEN_BUDGET``, falling back
            to ``CHUNK_TOKEN_BUDGET`` if the variable is unset or not an
            integer. A non-positive budget from any source is replaced by
            ``CHUNK_TOKEN_BUDGET``.

    Returns:
        A list of chunks, each a non-empty list of entries; an empty list
        when *entries* is empty.
    """
    if token_budget is None:
        budget_raw = os.getenv("JOURNAL_CHUNK_TOKEN_BUDGET", str(CHUNK_TOKEN_BUDGET)).strip()
        try:
            token_budget = int(budget_raw)
        except ValueError:
            token_budget = CHUNK_TOKEN_BUDGET

    # Validate explicit and environment-derived budgets alike: a zero or
    # negative budget would otherwise force every entry into its own chunk.
    if token_budget <= 0:
        token_budget = CHUNK_TOKEN_BUDGET

    chunks: list[list[JournalEntry]] = []
    current_chunk: list[JournalEntry] = []
    current_tokens = 0

    for entry in entries:
        entry_tokens = count_tokens(entry.raw_content)

        # Close the running chunk before it would overflow. The non-empty
        # check guarantees an oversized entry never produces an empty chunk.
        if current_chunk and current_tokens + entry_tokens > token_budget:
            chunks.append(current_chunk)
            current_chunk = []
            current_tokens = 0

        current_chunk.append(entry)
        current_tokens += entry_tokens

    if current_chunk:
        chunks.append(current_chunk)
    return chunks
|
|
|
|
|
|
def summarize_entry(entry: JournalEntry) -> str:
    """Ask the model for a short psychological analysis of a single entry.

    The entry's ``raw_content`` is embedded in a fixed prompt asking for
    recurring themes and behavioral patterns.

    Returns:
        Whatever ``llama_cpp_generate`` returns for that prompt.
    """
    # Called only for its effect: resolves (and validates) the NLP backend.
    _resolve_backend()

    instructions = (
        "You are a psychological analysis agent. Given the following journal entry, "
        "analyze and report on:\n"
        "- Recurring psychological themes\n"
        "- Behavioral patterns\n\n"
    )
    prompt = (
        instructions
        + "Journal entry:\n"
        + f"{entry.raw_content}\n\n"
        + "Respond with a concise, insightful analysis."
    )
    return llama_cpp_generate(prompt, max_tokens=2048)
|