transcription/scripts/migrate_lightrag_to_qmd.py
keboss-m eee8f4c8a4 Replace LightRAG with native Python RAG engine + add deploy tooling
- New: src/rag/engine/ — in-process hybrid search (FTS5 BM25 + sqlite-vec + LLM rerank)
- New: src/rag/qmd/ — compatibility layer (qmd_query, qmd_chat, qmd_chat_stream, qmd_index_*)
- New: src/ingest/stub_writer.py — .md stubs for binary files (videos, archives)
- New: scripts/deploy.sh + scripts/pull_models.sh + Makefile + .env.example
- Removed: LightRAG, sentence-transformers embedding via separate package, rag_standalone/
- Removed: @nousresearch/qmd npm dep (package not published); Node.js from Dockerfile
- Updated: tests/ (46 passed), docker-compose, .dockerignore, config.yaml, README

Engine: in-process Python (no daemon, no npm), sentence-transformers 384-dim,
RRF fusion (k=60), BM25 + vector with numpy fallback. WebSocket API unchanged.

Deploy: 'git clone' + 'make init' + 'make pull-models MODELS_SOURCE=...' + 'make up'.
Models (5.83 GB) live outside git; pulled via rsync from dev host.
2026-06-10 14:24:01 +03:00

183 lines
6.2 KiB
Python

"""migrate_lightrag_to_qmd.py — перенос индексов из LightRAG в qmd.
Идемпотентный скрипт: переиндексирует существующие .md в qmd-коллекции.
Повторный запуск безопасен (qmd content-hash проверка + перезапись stub).
Использование::
python scripts/migrate_lightrag_to_qmd.py [--org merakom] [--dry-run]
Что делает:
1. Находит все ``processed/<org>/meetings/<folder>/<stem>.md`` и ``<stem>_summary.md``.
2. Находит все ``processed/<org>/documents/<doc_id>/extracted.md``.
3. По ``.project.json`` определяет проект и вызывает
``src.rag.qmd.indexer.qmd_index_meeting`` или ``qmd_index_document``.
4. После успешной миграции помечает папку через ``.migrated_to_qmd``.
Снапшот::
Перед запуском в проде: ``tar -czf processed-pre-qmd.tar.gz processed``
"""
from __future__ import annotations
import argparse
import asyncio
import json
import sys
from pathlib import Path
from typing import Iterable, Optional
# Directory two levels above this file's resolved path (the parent of the
# directory that contains this script).
ROOT = Path(__file__).resolve().parent.parent
# Put ROOT first on the import path before the ``src.*`` imports below, which
# is why those imports come after executable code (hence the E402 waivers).
sys.path.insert(0, str(ROOT))
from src.config import load_config # noqa: E402
from src.rag.qmd.indexer import qmd_index_document, qmd_index_meeting # noqa: E402
# Name of the marker file written into a folder after it has been indexed;
# folders that already contain it are skipped by the folder iterators.
MIGRATION_MARKER = ".migrated_to_qmd"
def _iter_meetings(org_dir: Path) -> Iterable[Path]:
    """Yield meeting folders of an org that have not been migrated yet.

    Looks at the direct children of ``org_dir / "meetings"`` in sorted order
    and yields every directory that does not contain the migration marker
    file. Yields nothing when the ``meetings`` directory does not exist.
    """
    root = org_dir / "meetings"
    if root.exists():
        for entry in sorted(root.iterdir()):
            # Plain files are ignored; already-marked folders are skipped.
            if entry.is_dir() and not (entry / MIGRATION_MARKER).exists():
                yield entry
def _iter_documents(org_dir: Path) -> Iterable[Path]:
    """Yield document folders of an org that have not been migrated yet.

    Walks the direct children of ``org_dir / "documents"`` in sorted order,
    keeps only directories, and yields those without the migration marker
    file. Yields nothing when the ``documents`` directory does not exist.
    """
    docs_root = org_dir / "documents"
    if not docs_root.exists():
        return
    directories = (path for path in sorted(docs_root.iterdir()) if path.is_dir())
    for candidate in directories:
        already_migrated = (candidate / MIGRATION_MARKER).exists()
        if not already_migrated:
            yield candidate
def _read_project(folder: Path, fallback: Path) -> Optional[str]:
    """Return the project slug recorded in ``folder/.project.json``, if any.

    Args:
        folder: Meeting or document folder that may contain ``.project.json``.
        fallback: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        The value stored under ``"project_slug"``, or ``None`` when the
        metadata file is missing, cannot be read or decoded, is not valid
        JSON, or does not hold a JSON object.
    """
    meta_path = folder / ".project.json"
    if not meta_path.exists():
        return None
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        # A broken metadata file must not abort the whole migration run;
        # report it and let the caller skip this folder.
        print(f"[migrate] cannot read {meta_path}: {exc}")
        return None
    if not isinstance(meta, dict):
        # Valid JSON that is not an object (list, string, number, ...) has
        # no ``project_slug`` key to look up.
        print(f"[migrate] unexpected content in {meta_path}: not a JSON object")
        return None
    return meta.get("project_slug")
def _write_marker(folder: Path) -> None:
    """Write the migration marker file into ``folder``.

    The marker is a small UTF-8 JSON document holding the migration
    timestamp and the engine name; an existing marker is overwritten.
    """
    marker_path = folder / MIGRATION_MARKER
    payload = {"migrated_at": _now_iso(), "engine": "qmd"}
    serialized = json.dumps(payload, ensure_ascii=False)
    marker_path.write_text(serialized, encoding="utf-8")
def _now_iso() -> str:
    """Return the current local time as a timezone-aware ISO 8601 string.

    The wall-clock value is the same local time as before; the string now
    also carries the UTC offset so it is unambiguous across hosts.
    """
    from datetime import datetime

    # ``astimezone()`` without arguments attaches the system local timezone.
    return datetime.now().astimezone().isoformat()
def _pick_meeting_body(folder: Path) -> Optional[Path]:
    """Choose the transcript file to index for one meeting folder.

    Preference order: the first ``*.txt`` (sorted), then the first ``*.md``
    that is not a ``*_summary.md``, then any ``*.md``. Returns ``None`` when
    the folder has neither ``.txt`` nor ``.md`` files.
    """
    txt_files = sorted(folder.glob("*.txt"))
    if txt_files:
        return txt_files[0]
    md_files = sorted(folder.glob("*.md"))
    # Do not let a summary shadow the real transcript just because its name
    # happens to sort first; a summary is used only as a last resort.
    transcripts = [p for p in md_files if not p.name.endswith("_summary.md")]
    if transcripts:
        return transcripts[0]
    return md_files[0] if md_files else None


async def migrate_org(org_slug: str, dry_run: bool = False) -> dict:
    """Index every not-yet-migrated meeting and document folder of one org.

    Folders without a project slug (and meeting folders without any
    ``.txt``/``.md`` file) are reported and skipped. After a folder is
    indexed successfully a marker file is written into it so later runs
    skip it. A failure in one folder is recorded and does not stop the run.

    Args:
        org_slug: Name of the org directory under ``ROOT / "processed"``.
        dry_run: When true, only print what would be indexed; nothing is
            indexed and no marker files are written.

    Returns:
        A dict with ``"meetings"`` and ``"documents"`` (folders indexed
        successfully) and ``"errors"`` (number of folders whose indexing
        raised). All three keys are always present.
    """
    config = load_config()
    # ``or {}`` also covers a config where the ``rag`` key is present but null.
    rag_cfg = config.get("rag") or {}
    if not (rag_cfg.get("enabled", True) and rag_cfg.get("auto_index", True)):
        print(f"[migrate] RAG disabled in config — nothing to do for org='{org_slug}'")
        return {"meetings": 0, "documents": 0, "errors": 0}

    org_dir = ROOT / "processed" / org_slug
    if not org_dir.exists():
        print(f"[migrate] No processed/{org_slug} — skipping")
        return {"meetings": 0, "documents": 0, "errors": 0}

    meetings_count = 0
    documents_count = 0
    errors: list[str] = []

    for folder in _iter_meetings(org_dir):
        project = _read_project(folder, org_dir / "meetings")
        if not project:
            print(f"[migrate] skip meeting folder {folder.name}: no project")
            continue
        body_path = _pick_meeting_body(folder)
        if body_path is None:
            print(f"[migrate] skip meeting folder {folder.name}: no .md/.txt")
            continue
        summary_path = next(iter(sorted(folder.glob("*_summary.md"))), None)
        print(f"[migrate] meeting: org={org_slug} project={project} folder={folder.name}")
        if dry_run:
            continue
        try:
            await qmd_index_meeting(
                org_slug=org_slug,
                project_slug=project,
                body_path=body_path,
                summary_path=summary_path,
                txt_path=body_path,
            )
            # Mark only after indexing succeeded so failures are retried.
            _write_marker(folder)
            meetings_count += 1
        except Exception as exc:  # one bad folder must not abort the run
            errors.append(f"meeting {folder.name}: {exc}")

    for folder in _iter_documents(org_dir):
        project = _read_project(folder, org_dir / "documents")
        if not project:
            print(f"[migrate] skip document folder {folder.name}: no project")
            continue
        # NOTE(review): the existence of extracted.md is not checked here;
        # presumably the indexer copes with a missing file — confirm.
        extracted = folder / "extracted.md"
        print(f"[migrate] document: org={org_slug} project={project} folder={folder.name}")
        if dry_run:
            continue
        try:
            await qmd_index_document(
                org_slug=org_slug,
                project_slug=project,
                document_dir=folder,
                extracted_md=extracted,
            )
            _write_marker(folder)
            documents_count += 1
        except Exception as exc:  # one bad folder must not abort the run
            errors.append(f"document {folder.name}: {exc}")

    if errors:
        print(f"[migrate] {len(errors)} errors:")
        for err in errors:
            print(f" - {err}")
    return {"meetings": meetings_count, "documents": documents_count, "errors": len(errors)}
def main(argv: Optional[Iterable[str]] = None) -> int:
    """Command-line entry point: migrate one org and report the outcome.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv``.

    Returns:
        Process exit code: ``0`` when no folder failed, ``1`` otherwise.
    """
    parser = argparse.ArgumentParser(description="Migrate LightRAG caches to qmd collections.")
    parser.add_argument("--org", help="Org slug (default: bootstrap org from config)")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
    args = parser.parse_args(None if argv is None else list(argv))

    org_slug = args.org
    if not org_slug:
        config = load_config()
        # ``or`` instead of ``.get(key, default)``: a section that is present
        # but null would otherwise yield None and break the chained lookup.
        auth_cfg = config.get("auth") or {}
        bootstrap_cfg = auth_cfg.get("bootstrap") or {}
        org_slug = bootstrap_cfg.get("org_slug") or "merakom"

    print(f"[migrate] target org: {org_slug} dry-run: {args.dry_run}")
    result = asyncio.run(migrate_org(org_slug, dry_run=args.dry_run))
    print(f"[migrate] done: {result}")
    return 1 if result.get("errors") else 0
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    sys.exit(main())