Replace LightRAG with native Python RAG engine + add deploy tooling
- New: src/rag/engine/ — in-process hybrid search (FTS5 BM25 + sqlite-vec + LLM rerank)
- New: src/rag/qmd/ — compatibility layer (qmd_query, qmd_chat, qmd_chat_stream, qmd_index_*)
- New: src/ingest/stub_writer.py — .md stubs for binary files (videos, archives)
- New: scripts/deploy.sh + scripts/pull_models.sh + Makefile + .env.example
- Removed: LightRAG, sentence-transformers embedding via separate package, rag_standalone/
- Removed: @nousresearch/qmd npm dep (package not published); Node.js from Dockerfile
- Updated: tests/ (46 passed), docker-compose, .dockerignore, config.yaml, README
Engine: in-process Python (no daemon, no npm), sentence-transformers 384-dim,
RRF fusion (k=60), BM25 + vector with numpy fallback. WebSocket API unchanged.
Deploy: 'git clone' + 'make init' + 'make pull-models MODELS_SOURCE=...' + 'make up'.
Models (5.83 GB) live outside git; pulled via rsync from dev host.
2026-06-10 11:24:01 +00:00
|
|
|
|
"""Форматирование документа совещания для индексации в qmd (knowledge base)."""
|
2026-06-01 14:40:58 +00:00
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
|
from typing import Any, Dict, List
|
|
|
|
|
|
|
|
|
|
|
|
from src.document import format_time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def format_meeting_document(
    segments: List[Dict[str, Any]],
    metadata: Dict[str, Any],
    source_filename: str,
) -> str:
    """Build the plain-text meeting document that is indexed in qmd.

    The document has three parts: a header with identifying fields, a
    metadata section (summary, decisions, action items) and the full
    transcript with one line per segment that has non-blank text.

    Header fields fall back to their placeholder when the key is missing
    *or* holds a falsy value such as ``None``, so a null in ``metadata`` is
    never rendered as the literal text ``None``.

    Args:
        segments: Transcript segments; the keys read are ``start``,
            ``speaker`` and ``text``.
        metadata: Meeting metadata; the keys read are ``project``,
            ``section``, ``topic``, ``date``, ``participants``, ``summary``,
            ``key_decisions`` and ``action_items``.
        source_filename: Name of the source file, echoed in the header.

    Returns:
        The document as a single newline-joined string.
    """
    project = metadata.get("project") or "unknown"
    # The ID embeds the current local time, so it differs between calls.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    lines = [
        "=== СОВЕЩАНИЕ ===",
        f"ID: {project}_{stamp}",
        f"Проект: {project}",
        f"Раздел: {metadata.get('section') or 'Общие вопросы'}",
        f"Тема: {metadata.get('topic') or 'Не определена'}",
        f"Дата: {metadata.get('date') or 'Не указана'}",
        f"Источник: {source_filename}",
    ]

    participants = metadata.get("participants") or []
    if not participants:
        # Fallback: use the unique speaker labels found in the segments.
        participants = sorted({seg.get("speaker", "UNKNOWN") for seg in segments})
    lines.append(f"Участники: {', '.join(participants)}")

    lines.append("")
    lines.append("--- Метаданные ---")

    summary = metadata.get("summary")
    if summary:
        lines.append(f"Summary: {summary}")

    decisions = metadata.get("key_decisions") or []
    if decisions:
        lines.append("Решения:")
        lines.extend(f" {i}. {d}" for i, d in enumerate(decisions, 1))

    actions = metadata.get("action_items") or []
    if actions:
        lines.append("Action items:")
        for action in actions:
            who = action.get("who", "?")
            what = action.get("what", "")
            deadline = action.get("deadline", "")
            dl = f" (до {deadline})" if deadline else ""
            lines.append(f" - {who}: {what}{dl}")

    lines.append("")
    lines.append("--- Полная расшифровка ---")
    for seg in segments:
        text = seg.get("text", "").strip()
        if not text:
            # Nothing to print for this segment, so its timestamp is not
            # formatted either.
            continue
        ts = format_time(seg.get("start", 0.0))
        speaker = seg.get("speaker", "UNKNOWN")
        lines.append(f"[{ts}] {speaker}: {text}")

    return "\n".join(lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def format_summary_markdown(
    metadata: Dict[str, Any],
    brief: str,
    source_filename: str,
) -> str:
    """Render the meeting's short summary as a Markdown document.

    The output consists of a title, a header block (project, section, topic,
    date, source), an optional participants line, the brief itself and
    optional bullet sections for problems, decisions and action items.

    Header values fall back to an em dash when the key is missing or holds a
    falsy value such as ``None``. Each header line except the last ends with
    two spaces, which Markdown treats as a hard line break; a single trailing
    space would merge all header fields into one paragraph.

    Args:
        metadata: Meeting metadata; the keys read are ``project``,
            ``section``, ``topic``, ``date``, ``participants``, ``problems``,
            ``key_decisions`` and ``action_items``.
        brief: Summary text; surrounding whitespace is stripped.
        source_filename: Name of the source file, echoed in the header.

    Returns:
        The Markdown text, stripped and terminated by exactly one newline.
    """
    placeholder = "—"
    hard_break = "  "  # two trailing spaces force a Markdown line break

    lines = [
        "# Краткое содержание совещания",
        "",
        f"**Проект:** {metadata.get('project') or placeholder}{hard_break}",
        f"**Раздел:** {metadata.get('section') or placeholder}{hard_break}",
        f"**Тема:** {metadata.get('topic') or placeholder}{hard_break}",
        f"**Дата:** {metadata.get('date') or placeholder}{hard_break}",
        f"**Источник:** {source_filename}",
        "",
    ]

    participants = metadata.get("participants") or []
    if participants:
        lines.append(f"**Участники:** {', '.join(participants)}")
        lines.append("")

    lines.extend(["## Суть", "", brief.strip(), ""])

    # Plain bullet sections share one layout: heading, blank, items, blank.
    bullet_sections = (
        ("## Ключевые вопросы и проблемы", "problems"),
        ("## Принятые решения", "key_decisions"),
    )
    for heading, key in bullet_sections:
        items = metadata.get(key) or []
        if items:
            lines.append(heading)
            lines.append("")
            lines.extend(f"- {item}" for item in items)
            lines.append("")

    actions = metadata.get("action_items") or []
    if actions:
        lines.append("## Поручения")
        lines.append("")
        for action in actions:
            who = action.get("who", "?")
            what = action.get("what", "")
            deadline = action.get("deadline")
            dl = f" (до {deadline})" if deadline else ""
            lines.append(f"- **{who}:** {what}{dl}")
        lines.append("")

    return "\n".join(lines).strip() + "\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def format_global_document(doc_text: str, metadata: Dict[str, Any]) -> str:
    """Build the variant of a document used for the global (cross-project) index.

    A three-line header naming the project, section and topic is placed in
    front of ``doc_text``, separated from it by one blank line, so the
    project association is stated explicitly at the top of the text.

    Header values fall back to their placeholder when the key is missing or
    holds a falsy value such as ``None``, so a null is never rendered as the
    literal text ``None``.

    Args:
        doc_text: The already formatted document body.
        metadata: Meeting metadata; the keys read are ``project``,
            ``section`` and ``topic``.

    Returns:
        The header, a blank line and ``doc_text`` as one string.
    """
    project = metadata.get("project") or "unknown"
    section = metadata.get("section") or "Общие вопросы"
    topic = metadata.get("topic") or "Не определена"
    header = (
        f"=== СОВЕЩАНИЕ (Проект: {project}) ===\n"
        f"Раздел: {section}\n"
        f"Тема: {topic}\n"
    )
    return header + "\n" + doc_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_meeting_text_only(segments: List[Dict[str, Any]]) -> str:
    """Flatten segments into plain text (used as input for classify_meeting).

    Each segment whose stripped ``text`` is non-empty becomes one
    ``speaker: text`` line; a segment without a ``speaker`` key is labelled
    ``UNKNOWN``. Lines are joined with newlines.
    """
    # Pair up (speaker, stripped text) lazily, then keep only spoken lines.
    pairs = (
        (seg.get("speaker", "UNKNOWN"), seg.get("text", "").strip())
        for seg in segments
    )
    return "\n".join(f"{who}: {said}" for who, said in pairs if said)
|