#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build the project index from full_ocr_results.json in a given folder.

Usage:
    python build_index.py [folder]
"""
import sys
import json
import re
from pathlib import Path

# --- patterns for the front-matter metadata (PDF text layer) ---
_DOC_NUMBER_RE = re.compile(r'(180-[^\s\n]+)')
_CUSTOMER_RE = re.compile(r'Заказчик[:\s]+([^\n]+)')
_FRONT_DATE_RE = re.compile(r'(\d{2}\.\d{2}\.\d{2,4})')
_CHIEF_ENGINEER_RE = re.compile(r'Главный инженер проекта\s*\n?\s*([^\n()]+)')

# A TOC line is "<title> ...<any number of leader dots>... <page>".
# `\.{3,}` consumes the whole dot leader so it never leaks into the title.
_TOC_LINE_RE = re.compile(r'^(.+?)\s*\.{3,}\s*(\d+)\s*$')

# --- patterns for entities found in OCR lines ---
_FLOOR_RE = re.compile(r"(\d+)[-\s]?й\s*этаж", re.I)
_AXIS_RE = re.compile(r"\b([А-Я])\b")
_NUM_AXIS_RE = re.compile(r"\b(\d{1,2})\b")
_DIM_RE = re.compile(r"\b(\d{3,5})\b")
_ROOM_RE = re.compile(r"([Кк]вартира|[Кк]в\.?\s*\d+|[Пп]омещение\s*\d+)", re.I)
_DATE_RE = re.compile(r"\b(\d{2}\.\d{2}\.\d{2,4})\b")
_DOC_REF_RE = re.compile(r"(180-[^\s\n]+)")


def _text_layer(page: dict) -> str:
    """Return the page's PDF text layer, or "" when it is missing or null."""
    return page.get("pdf_text_layer") or ""


def _numeric_first_key(value: str):
    """Sort key: decimal strings first in numeric order, then other strings."""
    if value.isdecimal():
        # The raw string is the tie-breaker so "02" and "2" order stably.
        return (0, int(value), value)
    return (1, 0, value)


def _extract_metadata(pages: list) -> dict:
    """Pull document number, customer, date and chief engineer from pages 1-3."""
    front_text = "\n".join(_text_layer(p) for p in pages[:3])
    meta = {}

    m = _DOC_NUMBER_RE.search(front_text)
    if m:
        meta["document_number"] = m.group(1)

    m = _CUSTOMER_RE.search(front_text)
    if m:
        meta["customer"] = m.group(1).strip()

    m = _FRONT_DATE_RE.search(front_text)
    if m:
        meta["date"] = m.group(1)

    m = _CHIEF_ENGINEER_RE.search(front_text)
    if m:
        meta["chief_engineer"] = m.group(1).strip()

    return meta


def _extract_toc(pages: list) -> list:
    """Parse table-of-contents entries from the text layer of pages 3-6."""
    toc = []
    toc_text = "\n".join(_text_layer(p) for p in pages[2:6])
    for line in toc_text.splitlines():
        m = _TOC_LINE_RE.search(line.strip())
        if not m:
            continue
        title = m.group(1).strip()
        # Skip very short titles and lines that start with a document number.
        if len(title) > 5 and not title.startswith("180-"):
            toc.append({"title": title, "page": int(m.group(2))})
    return toc


def _build_page_index(pages: list) -> list:
    """Build one summary record (with short text previews) per page."""
    page_index = []
    for p in pages:
        ocr_text = "\n".join(entry["text"] for entry in p["ocr_lines"])
        if len(ocr_text) > 400:
            preview = ocr_text[:400] + "..."
        else:
            preview = ocr_text
        page_index.append({
            "page_number": p["page_number"],
            "image": p["image"],
            "line_count": p["ocr_line_count"],
            "pdf_text_preview": _text_layer(p)[:300],
            "ocr_text_preview": preview,
        })
    return page_index


def _extract_entities(pages: list) -> dict:
    """Collect floors, axes, dimensions, rooms, dates and doc refs from OCR lines."""
    floors, axes, dims = set(), set(), set()
    rooms, dates, docs = set(), set(), set()

    for p in pages:
        for entry in p["ocr_lines"]:
            text = entry["text"]
            floors.update(m.group(0) for m in _FLOOR_RE.finditer(text))
            axes.update(m.group(1) for m in _AXIS_RE.finditer(text))
            # Only 1..50 are accepted as numeric axis labels.
            axes.update(
                m.group(1) for m in _NUM_AXIS_RE.finditer(text)
                if 1 <= int(m.group(1)) <= 50
            )
            # Only 100..20000 are accepted as dimensions.
            dims.update(
                m.group(1) for m in _DIM_RE.finditer(text)
                if 100 <= int(m.group(1)) <= 20000
            )
            rooms.update(m.group(0) for m in _ROOM_RE.finditer(text))
            dates.update(m.group(1) for m in _DATE_RE.finditer(text))
            docs.update(m.group(1) for m in _DOC_REF_RE.finditer(text))

    return {
        "floors": sorted(floors),
        # Numeric values are ordered as numbers ("2" before "10"), not as text.
        "axes": sorted(axes, key=_numeric_first_key),
        "dimensions": sorted(dims, key=_numeric_first_key),
        "rooms": sorted(rooms),
        "dates": sorted(dates),
        "document_refs": sorted(docs),
    }


def build_index_for_folder(folder: Path) -> None:
    """Build ``project_index.json`` from ``full_ocr_results.json`` in *folder*.

    Reads ``<folder>/full_ocr_results.json``, derives project metadata, a
    table of contents, a per-page summary and a set of extracted entities,
    and writes them to ``<folder>/project_index.json`` as UTF-8 JSON.

    If the OCR results file does not exist, an error line is printed and the
    function returns without writing anything.

    Args:
        folder: Directory containing ``full_ocr_results.json``; a plain
            string path is accepted as well.

    Raises:
        KeyError: If the OCR JSON lacks the ``pages`` key or a page lacks one
            of the fields read for the page summary or entity extraction.
        json.JSONDecodeError: If the OCR results file is not valid JSON.
    """
    folder = Path(folder)
    ocr_path = folder / "full_ocr_results.json"
    if not ocr_path.exists():
        print(f"[ERR] Не найден {ocr_path}")
        return

    data = json.loads(ocr_path.read_text(encoding="utf-8"))
    pages = data["pages"]
    index_path = folder / "project_index.json"

    meta = _extract_metadata(pages)
    toc = _extract_toc(pages)
    page_index = _build_page_index(pages)
    entities = _extract_entities(pages)

    index = {
        "project": {
            "document_number": meta.get("document_number"),
            "customer": meta.get("customer"),
            "date": meta.get("date"),
            "chief_engineer": meta.get("chief_engineer"),
            "total_pdf_pages": len(pages),
            "ocr_processed_pages": len(pages),
        },
        "table_of_contents": toc,
        "pages": page_index,
        "entities": entities,
        "sources": {"ocr_json": str(ocr_path)}
    }

    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, ensure_ascii=False, indent=2)

    entity_counts = {k: len(v) for k, v in entities.items()}
    print(f"[build_index] Saved -> {index_path}")
    print(f" Pages: {len(page_index)}, TOC: {len(toc)}")
    print(f" Entities: {entity_counts}")


def main():
    """Run the index build for the folder given as argv[1], or the default."""
    folder = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(r"D:\TEST docs\output_123")
    build_index_for_folder(folder)


if __name__ == "__main__":
    main()