122 lines
4.1 KiB
Python
122 lines
4.1 KiB
Python
|
|
"""Перегенерация пустых или отсутствующих summary для уже обработанных совещаний."""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import asyncio
|
||
|
|
import json
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# Put the directory two levels above this file at the front of sys.path.
# NOTE(review): presumably this is the repository root, so that the `backend`
# and `src` imports below resolve when the file is run as a script — confirm.
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||
|
|
|
||
|
|
from backend.paths import PROCESSED_ROOT, MEETINGS_DIRNAME
|
||
|
|
from src.config import load_config, resolve_opencode_credentials
|
||
|
|
from src.rag.formatter import build_meeting_text_only, format_summary_markdown
|
||
|
|
from src.rag.parser import classify_meeting, generate_meeting_brief
|
||
|
|
|
||
|
|
|
||
|
|
def _summary_body(summary_path: Path) -> str:
    """Return the text that follows the first ``## Суть`` heading of a summary.

    Args:
        summary_path: Path of the summary markdown file to inspect.

    Returns:
        Everything after the first ``## Суть`` marker with surrounding
        whitespace stripped, or an empty string when the file does not exist
        or does not contain the marker.
    """
    marker = "## Суть"
    if summary_path.exists():
        content = summary_path.read_text(encoding="utf-8")
        # partition() cuts at the first occurrence only, so later repeats of
        # the marker stay inside the returned tail.
        _, found, tail = content.partition(marker)
        if found:
            return tail.strip()
    return ""
|
||
|
|
|
||
|
|
|
||
|
|
def _find_meeting_jobs(org_slug: str) -> list[dict]:
    """Collect one regeneration job per processed meeting folder.

    Scans ``PROCESSED_ROOT / org_slug / MEETINGS_DIRNAME``. A sub-folder yields
    a job only when it contains at least one ``*_segments.json`` file; the
    summary path is derived from that file's name.

    Args:
        org_slug: Organisation directory name under ``PROCESSED_ROOT``.

    Returns:
        Jobs ordered by folder path. Each job is a dict with the keys
        ``folder``, ``stem``, ``segments_path``, ``summary_path`` and
        ``display_name``. The list is empty when the meetings directory is
        missing or is not a directory.
    """
    meetings_dir = PROCESSED_ROOT / org_slug / MEETINGS_DIRNAME
    # is_dir() instead of exists(): a stray regular file at this path would
    # otherwise make iterdir() raise NotADirectoryError.
    if not meetings_dir.is_dir():
        return []

    suffix = "_segments.json"
    jobs = []
    for folder in sorted(meetings_dir.iterdir()):
        if not folder.is_dir():
            continue
        # glob() yields matches in no guaranteed order; taking the smallest
        # path makes the choice stable when a folder holds several files.
        segments_path = min(folder.glob(f"*{suffix}"), default=None)
        if segments_path is None:
            continue
        stem = segments_path.name.removesuffix(suffix)
        jobs.append({
            "folder": folder,
            "stem": stem,
            "segments_path": segments_path,
            "summary_path": folder / f"{stem}_summary.md",
            "display_name": f"{stem}.webm",
        })
    return jobs
|
||
|
|
|
||
|
|
|
||
|
|
async def _regenerate_job(job: dict, config: dict, dry_run: bool) -> bool:
    """Regenerate the summary of one meeting when it is missing or empty.

    Args:
        job: Job dict; the keys ``folder``, ``segments_path``,
            ``summary_path`` and ``display_name`` are read.
        config: Loaded configuration. Its optional ``rag`` section supplies
            the section list, model names and chunk sizes.
        dry_run: When true, only report the meeting; nothing is sent to the
            model and nothing is written.

    Returns:
        ``True`` when the summary file was rewritten (or would be, in dry-run
        mode). ``False`` when the summary already has a body, no API key is
        available, or the model returned an empty brief.
    """
    # A non-empty body after the summary marker means there is nothing to do.
    if _summary_body(job["summary_path"]):
        return False

    folder_name = job["folder"].name
    segments = json.loads(job["segments_path"].read_text(encoding="utf-8"))
    meeting_text = build_meeting_text_only(segments)
    rag_cfg = config.get("rag", {})
    api_key, base_url = resolve_opencode_credentials(config)

    # The project slug is optional metadata: any problem reading it falls back
    # to "unknown", but the failure is reported instead of silently dropped.
    project = "unknown"
    meta_path = job["folder"] / ".project.json"
    if meta_path.exists():
        try:
            meta = json.loads(meta_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            print(f"[warn] {folder_name}: не удалось прочитать .project.json: {exc}")
        else:
            # Valid JSON that is not an object carries no slug; keep the default.
            if isinstance(meta, dict):
                project = meta.get("project_slug", project)

    if not api_key:
        print(f"[skip] {job['folder'].name}: нет API-ключа")
        return False

    if dry_run:
        print(f"[dry-run] {job['folder'].name}")
        return True

    sections = rag_cfg.get("sections", ["Общие вопросы"])
    metadata = await classify_meeting(
        text=meeting_text,
        project=project,
        sections=sections,
        api_key=api_key,
        base_url=base_url,
        model=rag_cfg.get("index_model", "mimo-v2.5-free"),
        chunk_size=int(rag_cfg.get("classify_chunk_size", 7000)),
    )
    brief = await generate_meeting_brief(
        text=meeting_text,
        metadata=metadata,
        api_key=api_key,
        base_url=base_url,
        model=rag_cfg.get("summary_model", "deepseek-v4-flash-free"),
        chunk_size=int(rag_cfg.get("summary_chunk_size", 10000)),
    )
    # Never overwrite the file with an empty summary; leave it for a later run.
    if not (brief or "").strip():
        print(f"[warn] {job['folder'].name}: LLM вернул пустой brief")
        return False

    summary_md = format_summary_markdown(metadata, brief, job["display_name"])
    job["summary_path"].write_text(summary_md, encoding="utf-8")
    print(f"[ok] {job['folder'].name} ({len(brief)} символов)")
    return True
|
||
|
|
|
||
|
|
|
||
|
|
async def main():
    """Parse CLI options and backfill summaries for one organisation.

    Reads ``--org`` and ``--dry-run`` from the command line, processes the
    discovered meeting jobs one after another, and prints how many of them
    were (or, in dry-run mode, would be) updated.
    """
    cli = argparse.ArgumentParser(description="Backfill пустых summary совещаний")
    cli.add_argument("--org", default="merakom", help="org_slug")
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Только показать, что будет обновлено",
    )
    options = cli.parse_args()

    settings = load_config()
    meeting_jobs = _find_meeting_jobs(options.org)

    # Each job is awaited before the next one starts, in discovery order.
    outcomes = [
        await _regenerate_job(meeting_job, settings, options.dry_run)
        for meeting_job in meeting_jobs
    ]
    regenerated = sum(1 for done in outcomes if done)
    print(f"Готово: {regenerated} из {len(meeting_jobs)} совещаний")
|
||
|
|
|
||
|
|
|
||
|
|
# Run the async entry point only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())
|