transcription/scripts/backfill_summaries.py
keboss-m eee8f4c8a4 Replace LightRAG with native Python RAG engine + add deploy tooling
- New: src/rag/engine/ — in-process hybrid search (FTS5 BM25 + sqlite-vec + LLM rerank)
- New: src/rag/qmd/ — compatibility layer (qmd_query, qmd_chat, qmd_chat_stream, qmd_index_*)
- New: src/ingest/stub_writer.py — .md stubs for binary files (videos, archives)
- New: scripts/deploy.sh + scripts/pull_models.sh + Makefile + .env.example
- Removed: LightRAG, sentence-transformers embedding via separate package, rag_standalone/
- Removed: @nousresearch/qmd npm dep (package not published); Node.js from Dockerfile
- Updated: tests/ (46 passed), docker-compose, .dockerignore, config.yaml, README

Engine: in-process Python (no daemon, no npm), sentence-transformers 384-dim,
RRF fusion (k=60), BM25 + vector with numpy fallback. WebSocket API unchanged.

Deploy: 'git clone' + 'make init' + 'make pull-models MODELS_SOURCE=...' + 'make up'.
Models (5.83 GB) live outside git; pulled via rsync from dev host.
2026-06-10 14:24:01 +03:00

122 lines
4.1 KiB
Python

"""Перегенерация пустых или отсутствующих summary для уже обработанных совещаний."""
import argparse
import asyncio
import json
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from backend.paths import PROCESSED_ROOT, MEETINGS_DIRNAME
from src.config import load_config, resolve_opencode_credentials
from src.rag.formatter import build_meeting_text_only, format_summary_markdown
from src.rag.parser import classify_meeting, generate_meeting_brief
def _summary_body(summary_path: Path) -> str:
if not summary_path.exists():
return ""
text = summary_path.read_text(encoding="utf-8")
if "## Суть" not in text:
return ""
return text.split("## Суть", 1)[-1].strip()
def _find_meeting_jobs(org_slug: str) -> list[dict]:
    """Collect one backfill job per processed meeting folder of an organisation.

    Scans ``PROCESSED_ROOT / org_slug / MEETINGS_DIRNAME`` and, for every
    sub-folder that contains a ``*_segments.json`` file, builds a job
    description.

    Args:
        org_slug: Name of the organisation directory under ``PROCESSED_ROOT``.

    Returns:
        A list of dicts, ordered by folder path, each with the keys
        ``folder``, ``stem``, ``segments_path``, ``summary_path`` and
        ``display_name``. Empty when the meetings directory is missing.
    """
    meetings_dir = PROCESSED_ROOT / org_slug / MEETINGS_DIRNAME
    # is_dir() rather than exists(): a stray regular file at this path would
    # otherwise make iterdir() raise NotADirectoryError.
    if not meetings_dir.is_dir():
        return []

    segments_suffix = "_segments.json"
    jobs = []
    for folder in sorted(meetings_dir.iterdir()):
        if not folder.is_dir():
            continue
        # glob() yields matches in arbitrary order; sort so that the file
        # picked for a folder with several matches is the same on every run.
        segments_files = sorted(folder.glob(f"*{segments_suffix}"))
        if not segments_files:
            continue
        segments_path = segments_files[0]
        stem = segments_path.name.removesuffix(segments_suffix)
        jobs.append({
            "folder": folder,
            "stem": stem,
            "segments_path": segments_path,
            # The summary is looked up next to the segments file.
            "summary_path": folder / f"{stem}_summary.md",
            # The display name is always built with a .webm extension.
            "display_name": f"{stem}.webm",
        })
    return jobs
def _read_project_slug(folder: Path, default: str = "unknown") -> str:
    """Return ``project_slug`` from ``folder/.project.json``, or *default*.

    A missing file is silently treated as "no metadata". An unreadable or
    malformed file is reported with a warning instead of being swallowed.
    """
    meta_path = folder / ".project.json"
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return default
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"[warn] {folder.name}: не удалось прочитать .project.json ({exc})")
        return default
    if not isinstance(meta, dict):
        return default
    # A null or empty slug in the file falls back to the default as well.
    return meta.get("project_slug") or default


async def _regenerate_job(job: dict, config: dict, dry_run: bool) -> bool:
    """Regenerate the summary of one meeting if it is empty or missing.

    Args:
        job: Job description with the keys ``folder``, ``segments_path``,
            ``summary_path`` and ``display_name`` (as built by
            ``_find_meeting_jobs``).
        config: Loaded application config; the ``rag`` section and the
            credentials resolved from it are used.
        dry_run: When true, only report the meeting and write nothing.

    Returns:
        ``True`` when the summary was written (or, in dry-run mode, would be
        regenerated); ``False`` when the job was skipped because a summary
        body already exists, no API key is available, or the LLM returned an
        empty brief.
    """
    if _summary_body(job["summary_path"]):
        # A non-empty summary body already exists; nothing to backfill.
        return False

    folder = job["folder"]
    segments = json.loads(job["segments_path"].read_text(encoding="utf-8"))
    meeting_text = build_meeting_text_only(segments)
    # Tolerate a config whose "rag" key is present but null.
    rag_cfg = config.get("rag") or {}
    api_key, base_url = resolve_opencode_credentials(config)

    if not api_key:
        print(f"[skip] {folder.name}: нет API-ключа")
        return False
    if dry_run:
        print(f"[dry-run] {folder.name}")
        return True

    # Project metadata is only needed for the LLM calls, so read it last.
    project = _read_project_slug(folder)
    metadata = await classify_meeting(
        text=meeting_text,
        project=project,
        sections=rag_cfg.get("sections", ["Общие вопросы"]),
        api_key=api_key,
        base_url=base_url,
        model=rag_cfg.get("index_model", "mimo-v2.5-free"),
        chunk_size=int(rag_cfg.get("classify_chunk_size", 7000)),
    )
    brief = await generate_meeting_brief(
        text=meeting_text,
        metadata=metadata,
        api_key=api_key,
        base_url=base_url,
        model=rag_cfg.get("summary_model", "deepseek-v4-flash-free"),
        chunk_size=int(rag_cfg.get("summary_chunk_size", 10000)),
    )
    # Never overwrite the summary file with an empty result.
    if not (brief or "").strip():
        print(f"[warn] {folder.name}: LLM вернул пустой brief")
        return False

    summary_md = format_summary_markdown(metadata, brief, job["display_name"])
    job["summary_path"].write_text(summary_md, encoding="utf-8")
    print(f"[ok] {folder.name} ({len(brief)} символов)")
    return True
async def main():
    """Parse CLI options and backfill summaries for one organisation.

    Finds the meeting jobs for ``--org``, processes them one after another
    and prints how many of them were (or, with ``--dry-run``, would be)
    updated.
    """
    cli = argparse.ArgumentParser(description="Backfill пустых summary совещаний")
    cli.add_argument("--org", default="merakom", help="org_slug")
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Только показать, что будет обновлено",
    )
    options = cli.parse_args()

    settings = load_config()
    pending = _find_meeting_jobs(options.org)

    # Each job is awaited in turn, so meetings are handled sequentially.
    outcomes = [
        await _regenerate_job(entry, settings, options.dry_run)
        for entry in pending
    ]
    refreshed = sum(1 for done in outcomes if done)
    print(f"Готово: {refreshed} из {len(pending)} совещаний")


if __name__ == "__main__":
    # Script entry point: run the async backfill to completion.
    asyncio.run(main())