transcription/backend/paths.py
keboss-m eee8f4c8a4 Replace LightRAG with native Python RAG engine + add deploy tooling
- New: src/rag/engine/ — in-process hybrid search (FTS5 BM25 + sqlite-vec + LLM rerank)
- New: src/rag/qmd/ — compatibility layer (qmd_query, qmd_chat, qmd_chat_stream, qmd_index_*)
- New: src/ingest/stub_writer.py — .md stubs for binary files (videos, archives)
- New: scripts/deploy.sh + scripts/pull_models.sh + Makefile + .env.example
- Removed: LightRAG, sentence-transformers embedding via separate package, rag_standalone/
- Removed: @nousresearch/qmd npm dep (package not published); Node.js from Dockerfile
- Updated: tests/ (46 passed), docker-compose, .dockerignore, config.yaml, README

Engine: in-process Python (no daemon, no npm), sentence-transformers 384-dim,
RRF fusion (k=60), BM25 + vector with numpy fallback. WebSocket API unchanged.

Deploy: 'git clone' + 'make init' + 'make pull-models MODELS_SOURCE=...' + 'make up'.
Models (5.83 GB) live outside git; pulled via rsync from dev host.
2026-06-10 14:24:01 +03:00

77 lines
2.3 KiB
Python

"""Org-scoped filesystem paths."""
import json
import os
from datetime import datetime
from pathlib import Path
# Top-level storage roots. All three are relative paths, so they are
# interpreted relative to the process's current working directory.
# DATA_ROOT is not referenced by the helpers visible in this module.
DATA_ROOT = Path("data")
# Base for per-organization, per-user upload folders (see org_upload_dir).
UPLOAD_ROOT = Path("uploads")
# Base for per-organization processed artifacts: meetings, documents,
# the legacy RAG cache and (by default) the qmd collections.
PROCESSED_ROOT = Path("processed")
# Sub-directory names created under ``<root>/<org_slug>/``.
# Legacy LightRAG cache folder name (see org_rag_index_dir).
RAG_CACHE_DIRNAME = "lightrag_caches"
QMD_COLLECTIONS_DIRNAME = "qmd_collections"
MEETINGS_DIRNAME = "meetings"
DOCUMENTS_DIRNAME = "documents"
def org_upload_dir(org_slug: str, user_id: int) -> Path:
    """Return ``uploads/<org_slug>/<user_id>/``, creating it when absent.

    Args:
        org_slug: Organization slug used as the first path component.
        user_id: User identifier; its string form is the second component.

    Returns:
        The upload directory path (guaranteed to exist on return).
    """
    upload_dir = UPLOAD_ROOT.joinpath(org_slug, str(user_id))
    # Missing parents are created; an already-existing directory is fine.
    upload_dir.mkdir(parents=True, exist_ok=True)
    return upload_dir
def org_meetings_dir(org_slug: str) -> Path:
    """Return ``processed/<org_slug>/meetings/``, creating it when absent.

    Args:
        org_slug: Organization slug used as the path component under the
            processed root.

    Returns:
        The meetings directory path (guaranteed to exist on return).
    """
    meetings_dir = PROCESSED_ROOT.joinpath(org_slug, MEETINGS_DIRNAME)
    # Missing parents are created; an already-existing directory is fine.
    meetings_dir.mkdir(parents=True, exist_ok=True)
    return meetings_dir
def org_rag_index_dir(org_slug: str) -> Path:
    """Return the legacy cache dir ``processed/<org_slug>/lightrag_caches/``.

    Deprecated: kept only for migration. The directory is created when
    it does not exist yet.

    Args:
        org_slug: Organization slug used as the path component under the
            processed root.

    Returns:
        The legacy RAG cache directory path (guaranteed to exist on return).
    """
    legacy_cache_dir = PROCESSED_ROOT.joinpath(org_slug, RAG_CACHE_DIRNAME)
    # Missing parents are created; an already-existing directory is fine.
    legacy_cache_dir.mkdir(parents=True, exist_ok=True)
    return legacy_cache_dir
def org_qmd_root(org_slug: str) -> Path:
    """Return the root of an organization's qmd collections, creating it.

    By default this is ``processed/<org_slug>/qmd_collections/``. When the
    ``QMD_COLLECTION_ROOT`` environment variable is set to a non-empty
    value, that value replaces the ``processed`` root.

    Args:
        org_slug: Organization slug used as the path component under the
            chosen root.

    Returns:
        The qmd collections directory path (guaranteed to exist on return).
    """
    override = os.environ.get("QMD_COLLECTION_ROOT")
    # An unset or empty variable falls back to the processed root.
    if override:
        root = Path(override)
    else:
        root = PROCESSED_ROOT
    collections_dir = root / org_slug / QMD_COLLECTIONS_DIRNAME
    collections_dir.mkdir(parents=True, exist_ok=True)
    return collections_dir
def org_documents_dir(org_slug: str) -> Path:
    """Return ``processed/<org_slug>/documents/``, creating it when absent.

    Args:
        org_slug: Organization slug used as the path component under the
            processed root.

    Returns:
        The documents directory path (guaranteed to exist on return).
    """
    documents_dir = PROCESSED_ROOT.joinpath(org_slug, DOCUMENTS_DIRNAME)
    # Missing parents are created; an already-existing directory is fine.
    documents_dir.mkdir(parents=True, exist_ok=True)
    return documents_dir
def resolve_document_path(org_slug: str, rel_path: str) -> Path:
    """Resolve *rel_path* under the org documents dir; reject traversal.

    Args:
        org_slug: Organization slug; selects the documents directory
            returned by ``org_documents_dir``.
        rel_path: Path expected to be relative to that directory.

    Returns:
        The fully resolved path: the documents directory itself or a
        location beneath it.

    Raises:
        ValueError: If the resolved path ends up outside the documents
            directory (for example via ``..`` segments or an absolute
            ``rel_path``).
    """
    base = org_documents_dir(org_slug).resolve()
    full = (base / rel_path).resolve()
    # Compare path components, not string prefixes: a ``startswith`` test
    # would wrongly accept a sibling such as ``documents_backup`` because
    # its string form begins with ``.../documents``.
    try:
        full.relative_to(base)
    except ValueError:
        raise ValueError("Invalid path") from None
    return full
def resolve_meeting_path(org_slug: str, rel_path: str) -> Path:
    """Resolve *rel_path* under the org meetings dir; reject traversal.

    Args:
        org_slug: Organization slug; selects the meetings directory
            returned by ``org_meetings_dir``.
        rel_path: Path expected to be relative to that directory.

    Returns:
        The fully resolved path: the meetings directory itself or a
        location beneath it.

    Raises:
        ValueError: If the resolved path ends up outside the meetings
            directory (for example via ``..`` segments or an absolute
            ``rel_path``).
    """
    base = org_meetings_dir(org_slug).resolve()
    full = (base / rel_path).resolve()
    # Compare path components, not string prefixes: a ``startswith`` test
    # would wrongly accept a sibling such as ``meetings_old`` because its
    # string form begins with ``.../meetings``.
    try:
        full.relative_to(base)
    except ValueError:
        raise ValueError("Invalid path") from None
    return full
def write_folder_project_meta(folder_path: Path, project_slug: str) -> None:
    """Write a ``.project.json`` metadata file into *folder_path*.

    The file holds a JSON object with two keys: ``project_slug`` (the
    given slug with surrounding whitespace stripped and lower-cased) and
    ``created_at`` (the current local time from ``datetime.now()`` in
    ISO 8601 form). An existing file is overwritten.

    Args:
        folder_path: Directory that receives the ``.project.json`` file.
        project_slug: Project slug to record; normalized before writing.
    """
    normalized_slug = project_slug.strip().lower()
    payload = json.dumps(
        {
            "project_slug": normalized_slug,
            "created_at": datetime.now().isoformat(),
        },
        # Keep non-ASCII characters readable instead of \u-escaping them.
        ensure_ascii=False,
    )
    target = folder_path / ".project.json"
    target.write_text(payload, encoding="utf-8")