259 lines
9.4 KiB
Python
259 lines
9.4 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
Тестовый скрипт распознавания PDF (чертежи / документы)
|
|||
|
|
Стек: PyMuPDF (рендер) + RapidOCR (текст+координаты) + Docling (структура)
|
|||
|
|
Работает на CPU.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import json
|
|||
|
|
import fitz # PyMuPDF
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# 1. RapidOCR - лёгкий ONNX-OCR (входит в зависимости Docling)
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# RapidOCR is an optional dependency. Any failure while importing it,
# constructing the engine, or printing the success message leaves
# `engine` set to None; run_ocr() checks for that and skips OCR.
try:
    from rapidocr import RapidOCR

    engine = RapidOCR()
    print("[OK] RapidOCR загружен")
except Exception as load_error:
    print(f"[ERR] RapidOCR: {load_error}")
    engine = None
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# 2. Docling - структурный анализ PDF
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# Docling is an optional dependency. Any failure while importing it,
# constructing the converter, or printing the success message leaves
# `converter` set to None; run_docling() checks for that and skips the step.
try:
    from docling.document_converter import DocumentConverter

    converter = DocumentConverter()
    print("[OK] Docling загружен")
except Exception as load_error:
    print(f"[ERR] Docling: {load_error}")
    converter = None
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# 3. Конфигурация
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# Input document and output directory (hard-coded for this test script).
PDF_NAME = "Кронштадтский 16-18 НК1_ОСК (v3).pdf"
PDF_PATH = Path(r"D:\TEST docs") / PDF_NAME
OUT_DIR = Path(r"D:\TEST docs\output")
# parents=True: without it a missing "D:\TEST docs" raises FileNotFoundError
# here, at import time, before main() can report the missing PDF itself.
OUT_DIR.mkdir(parents=True, exist_ok=True)

MAX_PAGES_OCR = 10  # test mode: first 10 pages
DPI = 300  # page render resolution
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# 4. Рендер страниц PDF -> PNG (PyMuPDF)
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
def render_pages(pdf_path: Path, out_dir: Path, max_pages: "int | None" = None, dpi: int = 300) -> list[Path]:
    """Render the leading pages of a PDF to PNG files with PyMuPDF.

    Args:
        pdf_path: PDF file to open.
        out_dir: Directory that receives the ``page_NNN.png`` files; it is
            not created here, so it must already exist.
        max_pages: Upper bound on the number of pages rendered. A falsy
            value (``None`` or ``0``) renders every page.
        dpi: Target resolution of the rendered images.

    Returns:
        Paths of the written PNG files, in page order.
    """
    # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
    # The matrix is the same for every page, so build it once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    images = []
    doc = fitz.open(pdf_path)
    try:
        total = min(max_pages, len(doc)) if max_pages else len(doc)
        for i in range(total):
            pix = doc.load_page(i).get_pixmap(matrix=mat)
            img_path = out_dir / f"page_{i+1:03d}.png"
            pix.save(img_path)
            images.append(img_path)
            print(f" -> страница {i+1} отрендерена ({img_path.name})")
    finally:
        # Release the file handle even when rendering or saving fails.
        doc.close()
    return images
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# 5. OCR - извлечение текста с координатами (RapidOCR)
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
def run_ocr(image_paths: list[Path]):
    """Run RapidOCR over rendered page images.

    Args:
        image_paths: Image files to recognise, processed in the given order.

    Returns:
        Dict mapping each image file name to a list of
        ``{"text", "confidence", "bbox"}`` dicts (one per recognised line).
        An empty dict is returned when the OCR engine failed to load.
    """
    if engine is None:
        print("[WARN] OCR-dvigok nedostupen, propuskaem")
        return {}

    results = {}
    for img_path in image_paths:
        print(f" OCR: {img_path.name} ...")
        res = engine(img_path)
        # The result exposes txts / boxes / scores. On a page without any
        # recognised text these may be None rather than empty sequences,
        # which would make zip() raise TypeError, so treat that as "no lines".
        txts = getattr(res, "txts", None)
        boxes = getattr(res, "boxes", None)
        scores = getattr(res, "scores", None)
        if txts is None or boxes is None or scores is None:
            entries = []
        else:
            entries = [
                {
                    "text": txt,
                    "confidence": float(score),
                    # numpy boxes are converted to plain lists so the
                    # result stays JSON-serialisable.
                    "bbox": box.tolist() if hasattr(box, "tolist") else box,
                }
                for txt, box, score in zip(txts, boxes, scores)
            ]
        results[img_path.name] = entries
        print(f" naydeno {len(entries)} strok")
    return results
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# 6. Docling - структурный анализ (текстовый слой PDF)
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
def run_docling(pdf_path: Path, max_pages: int = 10):
    """Convert the leading pages of a PDF to Markdown with Docling.

    The input is first written to ``input.pdf`` inside a fresh temporary
    directory: Docling sometimes fails on Cyrillic file names, and trimming
    the document to ``max_pages`` avoids ``std::bad_alloc`` on large PDFs
    when running on CPU.

    Args:
        pdf_path: Source PDF.
        max_pages: Number of leading pages passed to Docling. A falsy or
            non-positive value means "no limit", matching render_pages().

    Returns:
        The Markdown export as a string, or ``None`` when Docling is not
        available or the conversion raised.
    """
    if converter is None:
        print("[WARN] Docling nedostupen, propuskaem")
        return None

    import shutil
    import tempfile

    limit = max_pages if max_pages and max_pages > 0 else None

    tmp_dir = tempfile.mkdtemp(prefix="docling_")
    try:
        tmp_path = Path(tmp_dir) / "input.pdf"

        doc = fitz.open(pdf_path)
        try:
            if limit is not None and len(doc) > limit:
                new_doc = fitz.open()
                try:
                    # One ranged insert copies pages 0..limit-1 in order.
                    new_doc.insert_pdf(doc, from_page=0, to_page=limit - 1)
                    new_doc.save(tmp_path)
                finally:
                    new_doc.close()
            else:
                shutil.copy2(pdf_path, tmp_path)
        finally:
            doc.close()

        if limit is not None:
            print(f" Docling: obrabotka (temp copy, first {max_pages} pages) ...")
        else:
            print(" Docling: obrabotka (temp copy, all pages) ...")

        try:
            result = converter.convert(tmp_path)
            return result.document.export_to_markdown()
        except Exception as e:
            # Best-effort step: report the failure and let the caller go on.
            print(f"[ERR] Docling: {e}")
            return None
    finally:
        # Cleanup stays best-effort (the file may still be locked on
        # Windows), but now also runs when preparing the temp copy fails.
        shutil.rmtree(tmp_dir, ignore_errors=True)
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# 7. Кастомный парсер - простая эвристика для чертежей
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
def parse_drawing_text(ocr_data: dict):
    """Extract floor, axis, dimension and room mentions from OCR output.

    Args:
        ocr_data: Mapping of image name to a list of entries, each a dict
            whose ``"text"`` value is one recognised string. Entries whose
            text is missing, not a string, or blank are skipped.

    Returns:
        Dict with the keys ``"floors"``, ``"axes"``, ``"dimensions"`` and
        ``"rooms"``, each holding a list of unique matched strings.
        Dimensions and numeric axes are ordered by numeric value (numeric
        axes come before letter axes); floors and rooms are ordered as
        plain strings.
    """
    import re

    floor_patterns = [
        re.compile(r"(\d+)[-\s]?й\s*этаж", re.I),
        re.compile(r"этаж\s*(\d+)", re.I),
        re.compile(r"этаж\s*([А-Я]\d?)", re.I),
        re.compile(r"НК\d", re.I),
    ]
    axis_patterns = [
        re.compile(r"\b([А-Я])\b"),  # single letter
        re.compile(r"\b(\d{1,2})\b"),  # one- or two-digit number
        re.compile(r"([А-Я])\s*[-–]\s*([А-Я])"),  # letter pair such as А-Б
    ]
    dim_patterns = [
        re.compile(r"\b(\d{3,5})\s*м?[мм]?\b"),  # 3600, 5400, ...
    ]
    room_patterns = [
        re.compile(r"([Кк]вартира|[Кк]в\.?\s*\d+)", re.I),
        re.compile(r"([Пп]омещение)\s*(\d+)", re.I),
    ]

    # Characters dropped from an axis match before it is stored.
    strip_separators = str.maketrans("", "", " -–")

    floors, axes, dimensions, rooms = set(), set(), set(), set()

    for lines in ocr_data.values():
        for entry in lines:
            raw = entry.get("text")
            if not isinstance(raw, str):
                continue
            text = raw.strip()
            if not text:
                continue

            # Floors and rooms: only the first hit of each pattern per line.
            for pat in floor_patterns:
                m = pat.search(text)
                if m:
                    floors.add(m.group(0))
            for pat in room_patterns:
                m = pat.search(text)
                if m:
                    rooms.add(m.group(0))

            # Axes: every hit, with separators removed.
            for pat in axis_patterns:
                for m in pat.finditer(text):
                    label = m.group(0).translate(strip_separators)
                    if label and len(label) <= 3:
                        axes.add(label)

            # Dimensions: every 3-5 digit number within a plausible range.
            for pat in dim_patterns:
                for m in pat.finditer(text):
                    val = m.group(1)
                    if 100 <= int(val) <= 50000:
                        dimensions.add(val)

    # Drop false-positive axes: keep letters, and numbers from 1 to 50 only.
    axes = {a for a in axes if a.isalpha() or (a.isdigit() and 1 <= int(a) <= 50)}

    def axis_key(label):
        # Numbers first in numeric order ("2" before "10"), then letters.
        return (0, int(label), label) if label.isdigit() else (1, 0, label)

    return {
        "floors": sorted(floors),
        "axes": sorted(axes, key=axis_key),
        # Numeric order, so "900" precedes "3600" and "12000".
        "dimensions": sorted(dimensions, key=lambda v: (int(v), v)),
        "rooms": sorted(rooms),
    }
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# 8. main
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
def main():
    """Run the pipeline: render pages, OCR, Docling, then the custom parser.

    Exits with status 1 when the configured PDF does not exist. Results are
    written to OUT_DIR as ocr_results.json, docling_output.md (only when
    Docling produced text) and parsed_entities.json.
    """

    def save_json(target, payload):
        # Write `payload` as UTF-8 JSON and report where it was saved.
        with target.open("w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)
        print(f" -> sohraneno {target}")

    if not PDF_PATH.exists():
        print(f"[ERR] Файл не найден: {PDF_PATH}")
        sys.exit(1)

    # Open the document briefly just to report its page count.
    probe = fitz.open(PDF_PATH)
    page_count = probe.page_count
    probe.close()
    print(f"\n=== PDF: {PDF_PATH.name} ===")
    print(f"Vsego stranits: {page_count}")

    # --- Step 1: render pages ---
    print("\n[1/4] Render stranits v izobrazheniya...")
    images = render_pages(PDF_PATH, OUT_DIR, MAX_PAGES_OCR, DPI)

    # --- Step 2: OCR ---
    print("\n[2/4] OCR (RapidOCR)...")
    ocr_results = run_ocr(images)
    save_json(OUT_DIR / "ocr_results.json", ocr_results)

    # --- Step 3: Docling ---
    print("\n[3/4] Strukturnyy analiz (Docling)...")
    md_text = run_docling(PDF_PATH, max_pages=MAX_PAGES_OCR)
    if not md_text:
        print(" Docling ne dal rezultata")
    else:
        md_path = OUT_DIR / "docling_output.md"
        with md_path.open("w", encoding="utf-8") as handle:
            handle.write(md_text)
        print(f" -> sohraneno {md_path}")

    # --- Step 4: custom parser ---
    print("\n[4/4] Custom parser chertezha...")
    parsed = parse_drawing_text(ocr_results)
    save_json(OUT_DIR / "parsed_entities.json", parsed)
    print("\n--- Naydennye sushchnosti ---")
    for category, values in parsed.items():
        print(f" {category}: {values}")

    print(f"\n=== Gotovo. Rezultaty v {OUT_DIR} ===")


if __name__ == "__main__":
    main()
|