opencode/build_index.py

120 lines
4.5 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Сборка базы (индекса) проекта из full_ocr_results.json в заданной папке.
Использование:
python build_index.py <input_folder>
"""
import sys
import json
import re
from pathlib import Path
# Front-matter fields and the patterns that locate them in the text layer of
# the first pages; group 1 of the first match becomes the field value.
_META_PATTERNS = (
    ("document_number", re.compile(r'(180-[^\s\n]+)')),
    ("customer", re.compile(r'Заказчик[:\s]+([^\n]+)')),
    ("date", re.compile(r'(\d{2}\.\d{2}\.\d{2,4})')),
    ("chief_engineer", re.compile(r'Главный инженер проекта\s*\n?\s*([^\n()]+)')),
)

# "<title> ..... <page>". The dot leader may be of any length (three or more);
# a fixed 3-4 dot pattern would leave the extra dots glued to the title.
_TOC_LINE_RE = re.compile(r'^(.+?)\s*\.{3,}\s*(\d+)\s*$')

# Entity patterns applied to every OCR line.
_FLOOR_RE = re.compile(r"(\d+)[-\s]?й\s*этаж", re.I)
_AXIS_RE = re.compile(r"\b([А-Я])\b")
_NUM_AXIS_RE = re.compile(r"\b(\d{1,2})\b")
_DIM_RE = re.compile(r"\b(\d{3,5})\b")
_ROOM_RE = re.compile(r"([Кк]вартира|[Кк]в\.?\s*\d+|[Пп]омещение\s*\d+)", re.I)
_DATE_RE = re.compile(r"\b(\d{2}\.\d{2}\.\d{2,4})\b")
_DOC_RE = re.compile(r"(180-[^\s\n]+)")


def _text_layer(page: dict) -> str:
    """Return the page's PDF text layer, or "" when it is missing or null."""
    return page.get("pdf_text_layer") or ""


def _numeric_first_key(value: str) -> tuple:
    """Sort key: all-digit strings by numeric value, then the rest alphabetically."""
    if value.isdecimal():
        return (0, int(value), value)
    return (1, 0, value)


def _extract_metadata(front_text: str) -> dict:
    """Map each front-matter field to its first match in *front_text*, or None."""
    meta = {}
    for field, pattern in _META_PATTERNS:
        match = pattern.search(front_text)
        meta[field] = match.group(1).strip() if match else None
    return meta


def _extract_toc(toc_text: str) -> list:
    """Parse "title ... page" lines of *toc_text* into {"title", "page"} dicts."""
    toc = []
    for raw_line in toc_text.splitlines():
        match = _TOC_LINE_RE.search(raw_line.strip())
        if not match:
            continue
        title = match.group(1).strip()
        # Skip very short fragments and lines that start with a document number.
        if len(title) > 5 and not title.startswith("180-"):
            toc.append({"title": title, "page": int(match.group(2))})
    return toc


def _build_page_index(pages: list) -> list:
    """Build one summary entry (ids, line count, text previews) per page."""
    entries = []
    for page in pages:
        ocr_text = "\n".join(line["text"] for line in page["ocr_lines"])
        if len(ocr_text) > 400:
            ocr_preview = ocr_text[:400] + "..."
        else:
            ocr_preview = ocr_text
        entries.append({
            "page_number": page["page_number"],
            "image": page["image"],
            "line_count": page["ocr_line_count"],
            "pdf_text_preview": _text_layer(page)[:300],
            "ocr_text_preview": ocr_preview,
        })
    return entries


def _collect_entities(pages: list) -> dict:
    """Collect the unique regex-matched entities found in all OCR lines."""
    floors, axes, dims, rooms, dates, docs = (set() for _ in range(6))
    for page in pages:
        for entry in page["ocr_lines"]:
            text = entry["text"]
            floors.update(m.group(0) for m in _FLOOR_RE.finditer(text))
            axes.update(_AXIS_RE.findall(text))
            # Only 1..50 is accepted as a numeric axis label.
            axes.update(v for v in _NUM_AXIS_RE.findall(text) if 1 <= int(v) <= 50)
            # Only 100..20000 is accepted as a dimension value.
            dims.update(v for v in _DIM_RE.findall(text) if 100 <= int(v) <= 20000)
            rooms.update(m.group(0) for m in _ROOM_RE.finditer(text))
            dates.update(_DATE_RE.findall(text))
            docs.update(_DOC_RE.findall(text))
    return {
        # Numeric labels are ordered by value ("2" before "10"), not as text.
        "floors": sorted(floors),
        "axes": sorted(axes, key=_numeric_first_key),
        "dimensions": sorted(dims, key=_numeric_first_key),
        "rooms": sorted(rooms),
        "dates": sorted(dates),
        "document_refs": sorted(docs),
    }


def build_index_for_folder(folder: Path):
    """Build ``project_index.json`` from ``full_ocr_results.json`` in *folder*.

    The index contains project metadata taken from the text layer of the first
    three pages, a table of contents parsed from pages 3-6, a per-page summary
    with text previews, and the sets of entities matched in the OCR lines.

    Args:
        folder: Directory that holds ``full_ocr_results.json``; the index is
            written next to it.

    Returns:
        None. If the OCR results file does not exist, an error line is printed
        and nothing is written.

    Raises:
        KeyError: If the JSON lacks ``pages`` or a page lacks a required key
            (``ocr_lines``, ``page_number``, ``image``, ``ocr_line_count``).
    """
    ocr_path = folder / "full_ocr_results.json"
    if not ocr_path.exists():
        print(f"[ERR] Не найден {ocr_path}")
        return

    data = json.loads(ocr_path.read_text(encoding="utf-8"))
    pages = data["pages"]
    index_path = folder / "project_index.json"

    # A page's text layer may be null, so go through _text_layer() everywhere.
    front_text = "\n".join(_text_layer(p) for p in pages[:3])
    toc_text = "\n".join(_text_layer(p) for p in pages[2:6])

    project = _extract_metadata(front_text)
    project["total_pdf_pages"] = len(pages)
    project["ocr_processed_pages"] = len(pages)

    toc = _extract_toc(toc_text)
    page_index = _build_page_index(pages)
    entities = _collect_entities(pages)

    index = {
        "project": project,
        "table_of_contents": toc,
        "pages": page_index,
        "entities": entities,
        "sources": {"ocr_json": str(ocr_path)},
    }
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, ensure_ascii=False, indent=2)

    print(f"[build_index] Saved -> {index_path}")
    print(f" Pages: {len(page_index)}, TOC: {len(toc)}")
    print(f" Entities: { {k: len(v) for k,v in entities.items()} }")
def main():
    """Index the folder named by the first CLI argument.

    When no argument is supplied, the hard-coded default folder is used.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        target = Path(cli_args[0])
    else:
        target = Path(r"D:\TEST docs\output_123")
    build_index_for_folder(target)
# Script entry point: run the indexer only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()