120 lines
4.5 KiB
Python
120 lines
4.5 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Сборка базы (индекса) проекта из full_ocr_results.json в заданной папке.
|
||
Использование:
|
||
python build_index.py <input_folder>
|
||
"""
|
||
|
||
import sys
|
||
import json
|
||
import re
|
||
from pathlib import Path
|
||
|
||
# Metadata fields looked up (first match wins) in the text layer of the
# leading pages.
_META_PATTERNS = (
    ("document_number", re.compile(r'(180-[^\s\n]+)')),
    ("customer", re.compile(r'Заказчик[:\s]+([^\n]+)')),
    ("date", re.compile(r'(\d{2}\.\d{2}\.\d{2,4})')),
    ("chief_engineer", re.compile(r'Главный инженер проекта\s*\n?\s*([^\n()]+)')),
)

# "<title> ... <page>" line of the table of contents.
_TOC_LINE_RE = re.compile(r'^(.+?)\s*\.\.\.\.?\s*(\d+)\s*$')

# Entity patterns applied to every OCR line.
_FLOOR_RE = re.compile(r"(\d+)[-\s]?й\s*этаж", re.I)
_AXIS_RE = re.compile(r"\b([А-Я])\b")
_NUM_AXIS_RE = re.compile(r"\b(\d{1,2})\b")
_DIM_RE = re.compile(r"\b(\d{3,5})\b")
_ROOM_RE = re.compile(r"([Кк]вартира|[Кк]в\.?\s*\d+|[Пп]омещение\s*\d+)", re.I)
_DATE_RE = re.compile(r"\b(\d{2}\.\d{2}\.\d{2,4})\b")
_DOC_RE = re.compile(r"(180-[^\s\n]+)")


def _page_text_layer(page):
    """Return the page's PDF text layer, or "" when it is missing or null."""
    return page.get("pdf_text_layer") or ""


def _extract_meta(front_text):
    """Return the four project metadata fields; a field not found is None."""
    meta = {}
    for key, pattern in _META_PATTERNS:
        found = pattern.search(front_text)
        meta[key] = found.group(1).strip() if found else None
    return meta


def _extract_toc(toc_text):
    """Parse "<title> ... <page>" lines into a list of {"title", "page"} dicts."""
    toc = []
    for raw_line in toc_text.splitlines():
        found = _TOC_LINE_RE.search(raw_line.strip())
        if not found:
            continue
        title = found.group(1).strip()
        # Very short titles and lines starting with "180-" are not TOC entries.
        if len(title) > 5 and not title.startswith("180-"):
            toc.append({"title": title, "page": int(found.group(2))})
    return toc


def _build_page_index(pages):
    """Return one summary dict per page with truncated text previews."""
    page_index = []
    for page in pages:
        ocr_text = "\n".join(line["text"] for line in page["ocr_lines"])
        if len(ocr_text) > 400:
            preview = ocr_text[:400] + "..."
        else:
            preview = ocr_text
        page_index.append({
            "page_number": page["page_number"],
            "image": page["image"],
            "line_count": page["ocr_line_count"],
            "pdf_text_preview": _page_text_layer(page)[:300],
            "ocr_text_preview": preview,
        })
    return page_index


def _collect_entities(pages):
    """Collect the unique regex matches from all OCR lines as sorted lists."""
    floors, axes, dims, rooms, dates, docs = (set() for _ in range(6))
    for page in pages:
        for entry in page["ocr_lines"]:
            text = entry["text"]
            floors.update(m.group(0) for m in _FLOOR_RE.finditer(text))
            axes.update(_AXIS_RE.findall(text))
            # Standalone 1-2 digit numbers count as axes only within 1..50.
            axes.update(v for v in _NUM_AXIS_RE.findall(text) if 1 <= int(v) <= 50)
            # Standalone 3-5 digit numbers count as dimensions within 100..20000.
            dims.update(v for v in _DIM_RE.findall(text) if 100 <= int(v) <= 20000)
            rooms.update(m.group(0) for m in _ROOM_RE.finditer(text))
            dates.update(_DATE_RE.findall(text))
            docs.update(_DOC_RE.findall(text))
    # All values are strings, so the ordering is lexicographic.
    return {
        "floors": sorted(floors), "axes": sorted(axes),
        "dimensions": sorted(dims), "rooms": sorted(rooms),
        "dates": sorted(dates), "document_refs": sorted(docs),
    }


def build_index_for_folder(folder: Path) -> None:
    """Build project_index.json from full_ocr_results.json inside *folder*.

    The input JSON must hold a "pages" list. Each page provides
    "page_number", "image", "ocr_line_count", "ocr_lines" (a list of dicts
    with a "text" key) and optionally "pdf_text_layer", which may be null.

    The written index contains project metadata (searched in the text layer
    of the first three pages), a table of contents (parsed from pages 3-6),
    per-page previews and the entities found in the OCR lines.

    Args:
        folder: Directory holding full_ocr_results.json; the index is
            written next to it. A plain string path is accepted as well.

    Returns:
        None. If the input file does not exist, an error line is printed
        and nothing is written.
    """
    folder = Path(folder)
    ocr_path = folder / "full_ocr_results.json"
    if not ocr_path.exists():
        print(f"[ERR] Не найден {ocr_path}")
        return

    data = json.loads(ocr_path.read_text(encoding="utf-8"))
    pages = data["pages"]
    index_path = folder / "project_index.json"

    # A page without a text layer contributes an empty string instead of
    # breaking the join with a TypeError.
    front_text = "\n".join(_page_text_layer(p) for p in pages[:3])
    toc_text = "\n".join(_page_text_layer(p) for p in pages[2:6])

    meta = _extract_meta(front_text)
    toc = _extract_toc(toc_text)
    page_index = _build_page_index(pages)
    entities = _collect_entities(pages)

    index = {
        "project": {
            "document_number": meta["document_number"],
            "customer": meta["customer"],
            "date": meta["date"],
            "chief_engineer": meta["chief_engineer"],
            "total_pdf_pages": len(pages),
            "ocr_processed_pages": len(pages),
        },
        "table_of_contents": toc,
        "pages": page_index,
        "entities": entities,
        "sources": {"ocr_json": str(ocr_path)}
    }

    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, ensure_ascii=False, indent=2)

    entity_counts = {name: len(values) for name, values in entities.items()}
    print(f"[build_index] Saved -> {index_path}")
    print(f" Pages: {len(page_index)}, TOC: {len(toc)}")
    print(f" Entities: {entity_counts}")
|
||
|
||
def main():
    """Build the index for the folder named by the first CLI argument.

    When no argument is given, a hard-coded default folder is used.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        target = Path(cli_args[0])
    else:
        target = Path(r"D:\TEST docs\output_123")
    build_index_for_folder(target)
|
||
|
||
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
|