120 lines
4.5 KiB
Python
120 lines
4.5 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
Сборка базы (индекса) проекта из full_ocr_results.json в заданной папке.
|
|||
|
|
Использование:
|
|||
|
|
python build_index.py <input_folder>
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import sys
|
|||
|
|
import json
|
|||
|
|
import re
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
def _text_layer(page: dict) -> str:
    """Return the page's PDF text layer, or "" when it is missing or null."""
    return page.get("pdf_text_layer") or ""


def _extract_metadata(front_text: str) -> dict:
    """Extract project metadata fields from the text of the leading pages.

    Only fields whose pattern matched are present in the returned dict.
    """
    patterns = (
        ("document_number", r'(180-[^\s\n]+)'),
        ("customer", r'Заказчик[:\s]+([^\n]+)'),
        ("date", r'(\d{2}\.\d{2}\.\d{2,4})'),
        ("chief_engineer", r'Главный инженер проекта\s*\n?\s*([^\n()]+)'),
    )
    meta = {}
    for key, pattern in patterns:
        found = re.search(pattern, front_text)
        if found:
            # strip() is a no-op for document_number and date: their
            # patterns cannot capture whitespace.
            meta[key] = found.group(1).strip()
    return meta


def _extract_toc(toc_text: str) -> list:
    """Parse "<title> ..... <page>" lines into table-of-contents entries."""
    # Accept a dot leader of any length (three or more dots). With a fixed
    # 3-4 dot pattern the lazy title group absorbs the surplus dots of a
    # longer leader and they end up inside the title.
    entry_re = re.compile(r'^(.+?)\s*\.{3,}\s*(\d+)\s*$')
    toc = []
    for raw_line in toc_text.splitlines():
        found = entry_re.search(raw_line.strip())
        if not found:
            continue
        title = found.group(1).strip()
        # Skip very short titles and lines that start with a "180-" reference.
        if len(title) > 5 and not title.startswith("180-"):
            toc.append({"title": title, "page": int(found.group(2))})
    return toc


def _build_page_index(pages: list) -> list:
    """Build one summary record (with text previews) per page."""
    page_index = []
    for page in pages:
        ocr_text = "\n".join(line["text"] for line in page["ocr_lines"])
        if len(ocr_text) > 400:
            preview = ocr_text[:400] + "..."
        else:
            preview = ocr_text
        page_index.append({
            "page_number": page["page_number"],
            "image": page["image"],
            "line_count": page["ocr_line_count"],
            "pdf_text_preview": _text_layer(page)[:300],
            "ocr_text_preview": preview,
        })
    return page_index


def _collect_entities(pages: list) -> dict:
    """Collect the distinct regex matches found in the OCR lines of all pages.

    Each value is a list of strings in plain string sort order.
    """
    floor_re = re.compile(r"(\d+)[-\s]?й\s*этаж", re.I)
    axis_re = re.compile(r"\b([А-Я])\b")
    num_axis_re = re.compile(r"\b(\d{1,2})\b")
    dim_re = re.compile(r"\b(\d{3,5})\b")
    room_re = re.compile(r"([Кк]вартира|[Кк]в\.?\s*\d+|[Пп]омещение\s*\d+)", re.I)
    date_re = re.compile(r"\b(\d{2}\.\d{2}\.\d{2,4})\b")
    doc_re = re.compile(r"(180-[^\s\n]+)")

    floors, axes, dims = set(), set(), set()
    rooms, dates, docs = set(), set(), set()

    for page in pages:
        for entry in page["ocr_lines"]:
            text = entry["text"]
            floors.update(m.group(0) for m in floor_re.finditer(text))
            axes.update(m.group(1) for m in axis_re.finditer(text))
            # One- or two-digit numbers are kept only inside the 1..50 range.
            axes.update(
                m.group(1)
                for m in num_axis_re.finditer(text)
                if 1 <= int(m.group(1)) <= 50
            )
            # Three- to five-digit numbers are kept only inside 100..20000.
            dims.update(
                m.group(1)
                for m in dim_re.finditer(text)
                if 100 <= int(m.group(1)) <= 20000
            )
            rooms.update(m.group(0) for m in room_re.finditer(text))
            dates.update(m.group(1) for m in date_re.finditer(text))
            docs.update(m.group(1) for m in doc_re.finditer(text))

    return {
        "floors": sorted(floors), "axes": sorted(axes),
        "dimensions": sorted(dims), "rooms": sorted(rooms),
        "dates": sorted(dates), "document_refs": sorted(docs),
    }


def build_index_for_folder(folder: Path):
    """Build ``project_index.json`` from ``full_ocr_results.json`` in *folder*.

    Reads the OCR results, extracts project metadata (from the text layer of
    the first three pages), a table of contents (from pages 3-6), a per-page
    summary and a set of regex-detected entities, then writes the combined
    index as UTF-8 JSON next to the input file and prints a short summary.

    Args:
        folder: Directory that contains ``full_ocr_results.json``.

    Returns:
        None. If the input file does not exist, an error line is printed and
        nothing is written.

    Raises:
        KeyError: If the JSON lacks ``"pages"`` or a page lacks one of the
            required keys (``ocr_lines``, ``page_number``, ``image``,
            ``ocr_line_count``).
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    ocr_path = folder / "full_ocr_results.json"
    if not ocr_path.exists():
        print(f"[ERR] Не найден {ocr_path}")
        return

    data = json.loads(ocr_path.read_text(encoding="utf-8"))
    pages = data["pages"]
    index_path = folder / "project_index.json"

    # A page's text layer may be null, so it is normalised to "" before
    # joining; joining the raw values would raise TypeError on None.
    front_text = "\n".join(_text_layer(p) for p in pages[:3])
    toc_text = "\n".join(_text_layer(p) for p in pages[2:6])

    meta = _extract_metadata(front_text)
    toc = _extract_toc(toc_text)
    page_index = _build_page_index(pages)
    entities = _collect_entities(pages)

    index = {
        "project": {
            "document_number": meta.get("document_number"),
            "customer": meta.get("customer"),
            "date": meta.get("date"),
            "chief_engineer": meta.get("chief_engineer"),
            "total_pdf_pages": len(pages),
            "ocr_processed_pages": len(pages),
        },
        "table_of_contents": toc,
        "pages": page_index,
        "entities": entities,
        "sources": {"ocr_json": str(ocr_path)}
    }

    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, ensure_ascii=False, indent=2)

    entity_counts = {k: len(v) for k, v in entities.items()}
    print(f"[build_index] Saved -> {index_path}")
    print(f" Pages: {len(page_index)}, TOC: {len(toc)}")
    print(f" Entities: {entity_counts}")
|
|||
|
|
|
|||
|
|
def main():
    """Command-line entry point.

    Uses the first command-line argument as the project folder; when no
    argument is given, falls back to the hard-coded default folder.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        target = Path(cli_args[0])
    else:
        target = Path(r"D:\TEST docs\output_123")
    build_index_for_folder(target)
|
|||
|
|
|
|||
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|