# transcription/scripts/backfill_summaries.py
# (122 lines, 4.1 KiB, Python)

"""Перегенерация пустых или отсутствующих summary для уже обработанных совещаний."""
import argparse
import asyncio
import json
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from backend.paths import PROCESSED_ROOT, MEETINGS_DIRNAME
from src.config import load_config, resolve_opencode_credentials
from src.rag.formatter import build_meeting_text_only, format_summary_markdown
from src.rag.parser import classify_meeting, generate_meeting_brief
def _summary_body(summary_path: Path) -> str:
    """Return the text that follows the "## Суть" heading of a summary file.

    Args:
        summary_path: Location of the meeting summary markdown file.

    Returns:
        Everything after the first "## Суть" marker with surrounding
        whitespace stripped, or an empty string when the file does not
        exist or does not contain the marker.
    """
    marker = "## Суть"
    if summary_path.exists():
        content = summary_path.read_text(encoding="utf-8")
        # partition() splits on the first occurrence only; `found` is empty
        # when the marker is absent.
        _, found, tail = content.partition(marker)
        if found:
            return tail.strip()
    return ""
def _find_meeting_jobs(org_slug: str) -> list[dict]:
    """Collect one job description per processed meeting folder of an organisation.

    Looks under ``PROCESSED_ROOT / org_slug / MEETINGS_DIRNAME`` and visits its
    entries in sorted order. Entries that are not directories, or that contain
    no ``*_segments.json`` file, are skipped.

    Args:
        org_slug: Name of the organisation sub-directory to scan.

    Returns:
        A list of dicts with the keys ``folder``, ``stem``, ``segments_path``,
        ``summary_path`` and ``display_name``; empty when the meetings
        directory does not exist.
    """
    suffix = "_segments.json"
    root = PROCESSED_ROOT / org_slug / MEETINGS_DIRNAME
    if not root.exists():
        return []

    found = []
    meeting_dirs = (entry for entry in sorted(root.iterdir()) if entry.is_dir())
    for meeting_dir in meeting_dirs:
        matches = list(meeting_dir.glob("*" + suffix))
        if matches:
            # Only the first match is used when a folder holds several files.
            segments_file = matches[0]
            # The stem is the file name with the "_segments.json" tail cut off.
            base = segments_file.name[: -len(suffix)]
            found.append(
                {
                    "folder": meeting_dir,
                    "stem": base,
                    "segments_path": segments_file,
                    "summary_path": meeting_dir / f"{base}_summary.md",
                    "display_name": f"{base}.webm",
                }
            )
    return found
def _load_project_slug(folder: Path, default: str = "unknown") -> str:
    """Read ``project_slug`` from the folder's ``.project.json``, or return *default*.

    The lookup is best-effort: a missing, unreadable or malformed metadata
    file never aborts the job, but the problem is reported instead of being
    silently swallowed.
    """
    meta_path = folder / ".project.json"
    if not meta_path.exists():
        return default
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"[warn] {folder.name}: не удалось прочитать .project.json ({exc})")
        return default
    if not isinstance(meta, dict):
        print(f"[warn] {folder.name}: .project.json не является JSON-объектом")
        return default
    return meta.get("project_slug", default)


async def _regenerate_job(job: dict, config: dict, dry_run: bool) -> bool:
    """Regenerate the summary of one meeting if it is currently empty or missing.

    Args:
        job: Job description as produced by ``_find_meeting_jobs`` (uses the
            ``folder``, ``segments_path``, ``summary_path`` and
            ``display_name`` keys).
        config: Loaded application config; the ``rag`` section supplies the
            section list, model names and chunk sizes.
        dry_run: When true, only report the meeting that would be processed;
            no LLM call is made and nothing is written.

    Returns:
        True when the summary was rewritten (or would be, in dry-run mode);
        False when the summary already has a body, no API key is available,
        or the LLM returned an empty brief.

    Raises:
        OSError, json.JSONDecodeError: If the segments file cannot be read or
            parsed.
    """
    # A non-empty "## Суть" section means the summary is already in place.
    if _summary_body(job["summary_path"]):
        return False

    folder = job["folder"]
    segments = json.loads(job["segments_path"].read_text(encoding="utf-8"))
    meeting_text = build_meeting_text_only(segments)
    rag_cfg = config.get("rag", {})
    api_key, base_url = resolve_opencode_credentials(config)

    if not api_key:
        print(f"[skip] {folder.name}: нет API-ключа")
        return False
    if dry_run:
        print(f"[dry-run] {folder.name}")
        return True

    # Project metadata is only needed for the real run, so it is read after
    # the early exits above.
    project = _load_project_slug(folder)

    sections = rag_cfg.get("sections", ["Общие вопросы"])
    metadata = await classify_meeting(
        text=meeting_text,
        project=project,
        sections=sections,
        api_key=api_key,
        base_url=base_url,
        model=rag_cfg.get("index_model", "mimo-v2.5-free"),
        chunk_size=int(rag_cfg.get("classify_chunk_size", 7000)),
    )
    brief = await generate_meeting_brief(
        text=meeting_text,
        metadata=metadata,
        api_key=api_key,
        base_url=base_url,
        model=rag_cfg.get("summary_model", "deepseek-v4-flash-free"),
        chunk_size=int(rag_cfg.get("summary_chunk_size", 10000)),
    )
    # Never overwrite the summary file with an empty brief.
    if not (brief or "").strip():
        print(f"[warn] {folder.name}: LLM вернул пустой brief")
        return False

    summary_md = format_summary_markdown(metadata, brief, job["display_name"])
    job["summary_path"].write_text(summary_md, encoding="utf-8")
    print(f"[ok] {folder.name} ({len(brief)} символов)")
    return True
async def main():
    """Parse CLI options, backfill every meeting of the chosen org, print a total.

    Options:
        --org: organisation slug to scan (defaults to "merakom").
        --dry-run: only report the meetings that would be updated.
    """
    cli = argparse.ArgumentParser(description="Backfill пустых summary совещаний")
    cli.add_argument("--org", default="merakom", help="org_slug")
    cli.add_argument("--dry-run", action="store_true", help="Только показать, что будет обновлено")
    options = cli.parse_args()

    settings = load_config()
    meeting_jobs = _find_meeting_jobs(options.org)

    # Jobs are awaited one at a time, in discovery order.
    outcomes = [
        await _regenerate_job(job, settings, options.dry_run)
        for job in meeting_jobs
    ]
    refreshed = sum(1 for done in outcomes if done)
    print(f"Готово: {refreshed} из {len(meeting_jobs)} совещаний")
# Script entry point: asyncio.run() drives the main() coroutine to completion.
if __name__ == "__main__":
    asyncio.run(main())