#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test script for recognizing PDFs (drawings / documents).

Stack: PyMuPDF (rendering) + RapidOCR (text + coordinates) + Docling (structure).
Runs on CPU.
"""

import os
import sys
import json
from pathlib import Path

import fitz  # PyMuPDF

# ------------------------------------------------------------------
# 1. RapidOCR - lightweight ONNX OCR (ships as a Docling dependency)
# ------------------------------------------------------------------
# Best-effort: when the import or construction fails, `engine` is None and the
# OCR stage is skipped instead of aborting the whole script.
try:
    from rapidocr import RapidOCR

    engine = RapidOCR()
    print("[OK] RapidOCR загружен")
except Exception as e:
    print(f"[ERR] RapidOCR: {e}")
    engine = None

# ------------------------------------------------------------------
# 2. Docling - structural analysis of the PDF
# ------------------------------------------------------------------
# Same best-effort policy: `converter` is None when Docling is unavailable.
try:
    from docling.document_converter import DocumentConverter

    converter = DocumentConverter()
    print("[OK] Docling загружен")
except Exception as e:
    print(f"[ERR] Docling: {e}")
    converter = None

# ------------------------------------------------------------------
# 3. Configuration
# ------------------------------------------------------------------
PDF_NAME = "Кронштадтский 16-18 НК1_ОСК (v3).pdf"
PDF_PATH = Path(r"D:\TEST docs") / PDF_NAME
OUT_DIR = Path(r"D:\TEST docs\output")
# parents=True: do not fail at import time when the parent folder is missing.
OUT_DIR.mkdir(parents=True, exist_ok=True)

MAX_PAGES_OCR = 10  # test mode: first 10 pages
DPI = 300  # page render resolution
# ------------------------------------------------------------------
# 4. Render PDF pages -> PNG (PyMuPDF)
# ------------------------------------------------------------------
def render_pages(pdf_path: Path, out_dir: Path, max_pages: int = None, dpi: int = 300):
    """Render the leading pages of a PDF to PNG files.

    Args:
        pdf_path: PDF file opened with PyMuPDF.
        out_dir: Directory that receives the ``page_NNN.png`` files; created
            if it does not exist yet.
        max_pages: Upper bound on the number of pages rendered. A falsy value
            (``None`` or ``0``) renders every page.
        dpi: Render resolution.

    Returns:
        List of the written PNG paths, in page order.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # PDF coordinates default to 72 dpi, so dpi / 72 is the zoom factor.
    # The matrix does not depend on the page, so build it once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    images = []
    # Context manager closes the document even when rendering a page fails.
    with fitz.open(pdf_path) as doc:
        total = min(max_pages, len(doc)) if max_pages else len(doc)
        for i in range(total):
            pix = doc.load_page(i).get_pixmap(matrix=mat)
            img_path = out_dir / f"page_{i+1:03d}.png"
            pix.save(img_path)
            images.append(img_path)
            print(f" -> страница {i+1} отрендерена ({img_path.name})")
    return images


# ------------------------------------------------------------------
# 5. OCR - text extraction with coordinates (RapidOCR)
# ------------------------------------------------------------------
def run_ocr(image_paths: list[Path], ocr_engine=None):
    """Run OCR over rendered page images.

    Args:
        image_paths: Image files to recognise.
        ocr_engine: Optional callable used instead of the module-level
            ``engine``. It is called with one image path and must return an
            object exposing ``txts``, ``boxes`` and ``scores``.

    Returns:
        Dict mapping each image file name to a list of
        ``{"text", "confidence", "bbox"}`` entries. Empty dict when no OCR
        engine is available.
    """
    active_engine = ocr_engine if ocr_engine is not None else engine
    if active_engine is None:
        print("[WARN] OCR-dvigok nedostupen, propuskaem")
        return {}

    results = {}
    for img_path in image_paths:
        print(f" OCR: {img_path.name} ...")
        res = active_engine(img_path)

        # The result fields may be None (e.g. a page without any detected
        # text); zipping None would raise TypeError, so treat it as no lines.
        fields = (res.txts, res.boxes, res.scores)
        if any(field is None for field in fields):
            entries = []
        else:
            entries = [
                {
                    "text": txt,
                    "confidence": float(score),
                    # array-like boxes are converted to plain lists for JSON
                    "bbox": box.tolist() if hasattr(box, "tolist") else box,
                }
                for txt, box, score in zip(*fields)
            ]

        results[img_path.name] = entries
        print(f" naydeno {len(entries)} strok")
    return results
# ------------------------------------------------------------------
# 6. Docling - structural analysis (PDF text layer)
# ------------------------------------------------------------------
def run_docling(pdf_path: Path, max_pages: int = 10):
    """Convert the leading pages of a PDF to Markdown with Docling.

    The input is first copied to a temporary file with a Latin-only path,
    truncated to ``max_pages`` pages when the PDF is longer than that.

    Args:
        pdf_path: Source PDF.
        max_pages: Maximum number of leading pages handed to Docling.

    Returns:
        Markdown text, or ``None`` when Docling is unavailable or the
        conversion fails.

    Raises:
        ValueError: If ``max_pages`` is smaller than 1.
    """
    if converter is None:
        print("[WARN] Docling nedostupen, propuskaem")
        return None
    if max_pages < 1:
        # A truncated copy with zero pages cannot be saved; fail early and clearly.
        raise ValueError("max_pages must be >= 1")

    import shutil
    import tempfile

    # Docling sometimes fails on cyrillic paths -> copy to temp latin path.
    # Also limit pages to avoid std::bad_alloc on CPU with large PDFs.
    tmp_dir = tempfile.mkdtemp(prefix="docling_")
    try:
        tmp_path = Path(tmp_dir) / "input.pdf"
        with fitz.open(pdf_path) as doc:
            if len(doc) > max_pages:
                with fitz.open() as new_doc:
                    # One range insert instead of one call per page.
                    new_doc.insert_pdf(doc, from_page=0, to_page=max_pages - 1)
                    new_doc.save(tmp_path)
            else:
                shutil.copy2(pdf_path, tmp_path)

        print(f" Docling: obrabotka (temp copy, first {max_pages} pages) ...")
        try:
            result = converter.convert(tmp_path)
            return result.document.export_to_markdown()
        except Exception as e:
            # Deliberate best-effort: report and let the caller continue.
            print(f"[ERR] Docling: {e}")
            return None
    finally:
        # Always remove the temp copy, including when the copy step itself fails.
        shutil.rmtree(tmp_dir, ignore_errors=True)


# ------------------------------------------------------------------
# 7. Custom parser - simple heuristics for drawings
# ------------------------------------------------------------------
def parse_drawing_text(ocr_data: dict):
    """Scan OCR results for floor, axis, dimension and room markers.

    Args:
        ocr_data: Dict mapping an image name to a list of entries, each a
            dict with at least a ``"text"`` string.

    Returns:
        Dict with keys ``"floors"``, ``"axes"``, ``"dimensions"`` and
        ``"rooms"``. Each value is a list of the distinct matched strings in
        natural order (embedded numbers compare numerically, so ``"300"``
        comes before ``"1200"``).
    """
    import re

    floor_patterns = [
        re.compile(r"(\d+)[-\s]?й\s*этаж", re.I),
        re.compile(r"этаж\s*(\d+)", re.I),
        re.compile(r"этаж\s*([А-Я]\d?)", re.I),
        re.compile(r"НК\d", re.I),
    ]
    axis_patterns = [
        re.compile(r"\b([А-Я])\b"),  # single letter
        re.compile(r"\b(\d{1,2})\b"),  # one- or two-digit number
        re.compile(r"([А-Я])\s*[-–]\s*([А-Я])"),  # letter range such as А-Б
    ]
    dim_patterns = [
        re.compile(r"\b(\d{3,5})\s*м?[мм]?\b"),  # 3600, 5400, etc.
    ]
    room_patterns = [
        re.compile(r"([Кк]вартира|[Кк]в\.?\s*\d+)", re.I),
        re.compile(r"([Пп]омещение)\s*(\d+)", re.I),
    ]

    # Drops spaces, hyphens and en dashes from an axis match in one pass.
    axis_strip = str.maketrans("", "", " -–")

    floors, axes, dimensions, rooms = set(), set(), set(), set()

    for lines in ocr_data.values():
        for entry in lines:
            text = entry["text"].strip()

            # floors: first match per pattern
            for pat in floor_patterns:
                m = pat.search(text)
                if m:
                    floors.add(m.group(0))

            # axes: every match, kept only when short after stripping separators
            for pat in axis_patterns:
                for m in pat.finditer(text):
                    label = m.group(0).translate(axis_strip)
                    if label and len(label) <= 3:
                        axes.add(label)

            # dimensions: every number inside the accepted range
            for pat in dim_patterns:
                for m in pat.finditer(text):
                    val = m.group(1)
                    if 100 <= int(val) <= 50000:
                        dimensions.add(val)

            # rooms: first match per pattern
            for pat in room_patterns:
                m = pat.search(text)
                if m:
                    rooms.add(m.group(0))

    # Drop false axes: numeric labels are kept only in the range 1..50.
    axes = {
        a for a in axes
        if a.isalpha() or (a.isdigit() and 1 <= int(a) <= 50)
    }

    def natural_key(value):
        # Split into digit / non-digit runs so numbers compare by value.
        # isdecimal() mirrors what \d matches, so int() cannot fail here.
        return [
            (0, int(tok), "") if tok.isdecimal() else (1, 0, tok)
            for tok in re.findall(r"\d+|\D+", value)
        ]

    found = {
        "floors": floors,
        "axes": axes,
        "dimensions": dimensions,
        "rooms": rooms,
    }
    return {k: sorted(v, key=natural_key) for k, v in found.items()}
# ------------------------------------------------------------------
# 8. main
# ------------------------------------------------------------------
def main():
    """Run the pipeline: render pages, OCR them, run Docling, parse entities.

    Exits with status 1 when the configured PDF does not exist. All outputs
    are written into ``OUT_DIR``.
    """

    def _save_json(payload, target):
        # Serialise as UTF-8 JSON (non-ASCII kept readable) and report the path.
        target.write_text(
            json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        print(f" -> sohraneno {target}")

    if not PDF_PATH.exists():
        print(f"[ERR] Файл не найден: {PDF_PATH}")
        sys.exit(1)

    # Open the PDF only long enough to read its page count.
    pdf = fitz.open(PDF_PATH)
    page_count = pdf.page_count
    pdf.close()

    print(f"\n=== PDF: {PDF_PATH.name} ===")
    print(f"Vsego stranits: {page_count}")

    # --- 8.1 Render pages ---
    print("\n[1/4] Render stranits v izobrazheniya...")
    images = render_pages(PDF_PATH, OUT_DIR, MAX_PAGES_OCR, DPI)

    # --- 8.2 OCR ---
    print("\n[2/4] OCR (RapidOCR)...")
    ocr_results = run_ocr(images)
    _save_json(ocr_results, OUT_DIR / "ocr_results.json")

    # --- 8.3 Docling ---
    print("\n[3/4] Strukturnyy analiz (Docling)...")
    md_text = run_docling(PDF_PATH, max_pages=MAX_PAGES_OCR)
    if not md_text:
        print(" Docling ne dal rezultata")
    else:
        md_path = OUT_DIR / "docling_output.md"
        md_path.write_text(md_text, encoding="utf-8")
        print(f" -> sohraneno {md_path}")

    # --- 8.4 Custom parser ---
    print("\n[4/4] Custom parser chertezha...")
    parsed = parse_drawing_text(ocr_results)
    _save_json(parsed, OUT_DIR / "parsed_entities.json")

    print("\n--- Naydennye sushchnosti ---")
    for name, values in parsed.items():
        print(f" {name}: {values}")

    print(f"\n=== Gotovo. Rezultaty v {OUT_DIR} ===")


if __name__ == "__main__":
    main()