# opencode/process_pdf.py
#
# 259 lines
# 9.4 KiB
# Python
# Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Тестовый скрипт распознавания PDF (чертежи / документы)
Стек: PyMuPDF (рендер) + RapidOCR (текст+координаты) + Docling (структура)
Работает на CPU.
"""
import os
import sys
import json
import fitz # PyMuPDF
from pathlib import Path
# ------------------------------------------------------------------
# 1. RapidOCR - лёгкий ONNX-OCR (входит в зависимости Docling)
# ------------------------------------------------------------------
# Build the shared RapidOCR engine once at import time. The dependency is
# treated as optional: any failure while importing or constructing it is
# printed and leaves `engine` set to None.
try:
    from rapidocr import RapidOCR

    engine = RapidOCR()
    print("[OK] RapidOCR загружен")
except Exception as load_error:
    engine = None
    print(f"[ERR] RapidOCR: {load_error}")
# ------------------------------------------------------------------
# 2. Docling - структурный анализ PDF
# ------------------------------------------------------------------
# Build the shared Docling converter once at import time. The dependency is
# treated as optional: any failure while importing or constructing it is
# printed and leaves `converter` set to None.
try:
    from docling.document_converter import DocumentConverter

    converter = DocumentConverter()
    print("[OK] Docling загружен")
except Exception as load_error:
    converter = None
    print(f"[ERR] Docling: {load_error}")
# ------------------------------------------------------------------
# 3. Конфигурация
# ------------------------------------------------------------------
# Input document and output location used by main().
PDF_NAME = "Кронштадтский 16-18 НК1_ОСК (v3).pdf"
PDF_PATH = Path(r"D:\TEST docs") / PDF_NAME
OUT_DIR = Path(r"D:\TEST docs\output")
# parents=True: do not fail at import time with FileNotFoundError when the
# parent folder has not been created yet.
OUT_DIR.mkdir(parents=True, exist_ok=True)
MAX_PAGES_OCR = 10  # test mode: first 10 pages
DPI = 300  # page render resolution
# ------------------------------------------------------------------
# 4. Рендер страниц PDF -> PNG (PyMuPDF)
# ------------------------------------------------------------------
def render_pages(pdf_path: Path, out_dir: Path, max_pages: int = None, dpi: int = 300):
    """Render the leading pages of a PDF to PNG files using PyMuPDF.

    Args:
        pdf_path: PDF file to open.
        out_dir: Directory that receives the ``page_NNN.png`` files; it is
            not created here.
        max_pages: Upper bound on the number of pages rendered. ``None`` or
            ``0`` renders every page.
        dpi: Render resolution.

    Returns:
        List of the written image paths, in page order.
    """
    doc = fitz.open(pdf_path)
    try:
        total = min(max_pages, len(doc)) if max_pages else len(doc)
        # PDF coordinates are 72 units per inch, so dpi / 72 is the zoom
        # factor. The matrix is the same for every page, so build it once.
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        images = []
        for i in range(total):
            page = doc.load_page(i)
            pix = page.get_pixmap(matrix=mat)
            img_path = out_dir / f"page_{i+1:03d}.png"
            pix.save(img_path)
            images.append(img_path)
            print(f" -> страница {i+1} отрендерена ({img_path.name})")
    finally:
        # Release the document even when rendering or saving a page fails.
        doc.close()
    return images
# ------------------------------------------------------------------
# 5. OCR - извлечение текста с координатами (RapidOCR)
# ------------------------------------------------------------------
def run_ocr(image_paths: list[Path]):
    """Run the module-level RapidOCR engine over rendered page images.

    Args:
        image_paths: Image files to recognise.

    Returns:
        Dict mapping each image file name to a list of entries with keys
        ``"text"``, ``"confidence"`` (float) and ``"bbox"``. An empty dict is
        returned when the engine is unavailable.
    """
    if engine is None:
        print("[WARN] OCR-dvigok nedostupen, propuskaem")
        return {}
    results = {}
    for img_path in image_paths:
        print(f" OCR: {img_path.name} ...")
        res = engine(img_path)
        txts, boxes, scores = res.txts, res.boxes, res.scores
        # NOTE(review): the result fields appear to default to None when
        # nothing is recognised on a page; zip() over None would raise
        # TypeError, so such a page is recorded with no entries. Confirm
        # against the installed rapidocr version.
        if txts is None or boxes is None or scores is None:
            entries = []
        else:
            entries = [
                {
                    "text": txt,
                    "confidence": float(score),
                    # Boxes exposing tolist() (e.g. arrays) are converted to
                    # plain lists so the result stays JSON-serialisable.
                    "bbox": box.tolist() if hasattr(box, "tolist") else box,
                }
                for txt, box, score in zip(txts, boxes, scores)
            ]
        results[img_path.name] = entries
        print(f" naydeno {len(entries)} strok")
    return results
# ------------------------------------------------------------------
# 6. Docling - структурный анализ (текстовый слой PDF)
# ------------------------------------------------------------------
def run_docling(pdf_path: Path, max_pages: int = 10):
    """Convert the first pages of a PDF to Markdown with Docling.

    The input is first copied (or trimmed to ``max_pages`` pages) into a
    temporary directory under the fixed name ``input.pdf``; the converter is
    run on that copy and the temporary directory is removed afterwards.

    Args:
        pdf_path: Source PDF.
        max_pages: Maximum number of leading pages handed to Docling.

    Returns:
        The Markdown export as a string, or ``None`` when the converter is
        unavailable or the conversion raises.
    """
    if converter is None:
        print("[WARN] Docling nedostupen, propuskaem")
        return None

    import shutil
    import tempfile

    # Docling sometimes fails on Cyrillic paths -> work on a temp copy with a
    # Latin name. The page limit avoids std::bad_alloc on CPU with large PDFs.
    tmp_dir = tempfile.mkdtemp(prefix="docling_")
    try:
        tmp_path = Path(tmp_dir) / "input.pdf"
        doc = fitz.open(pdf_path)
        try:
            if len(doc) > max_pages:
                new_doc = fitz.open()
                try:
                    # len(doc) > max_pages here, so exactly max_pages pages
                    # are copied.
                    for i in range(max_pages):
                        new_doc.insert_pdf(doc, from_page=i, to_page=i)
                    new_doc.save(tmp_path)
                finally:
                    new_doc.close()
            else:
                shutil.copy2(pdf_path, tmp_path)
        finally:
            doc.close()

        print(f" Docling: obrabotka (temp copy, first {max_pages} pages) ...")
        try:
            result = converter.convert(tmp_path)
            return result.document.export_to_markdown()
        except Exception as e:
            print(f"[ERR] Docling: {e}")
            return None
    finally:
        # Best-effort cleanup: never let a locked or missing temp file mask
        # the conversion result.
        shutil.rmtree(tmp_dir, ignore_errors=True)
# ------------------------------------------------------------------
# 7. Кастомный парсер - простая эвристика для чертежей
# ------------------------------------------------------------------
def parse_drawing_text(ocr_data: dict):
    """Collect floor, axis, dimension and room mentions from OCR output.

    Simple regex heuristics are applied to every recognised text line.

    Args:
        ocr_data: Mapping of image name to a list of entries; only each
            entry's ``"text"`` value is read.

    Returns:
        Dict with keys ``"floors"``, ``"axes"``, ``"dimensions"`` and
        ``"rooms"``, each holding a sorted list of unique matched strings.
    """
    import re

    floor_res = (
        re.compile(r"(\d+)[-\s]?й\s*этаж", re.I),
        re.compile(r"этаж\s*(\d+)", re.I),
        re.compile(r"этаж\s*([А-Я]\d?)", re.I),
        re.compile(r"НК\d", re.I),
    )
    axis_res = (
        re.compile(r"\b([А-Я])\b"),  # standalone letter
        re.compile(r"\b(\d{1,2})\b"),  # standalone one- or two-digit number
        re.compile(r"([А-Я])\s*[-]\s*([А-Я])"),  # letter - letter pair
    )
    dim_res = (
        re.compile(r"\b(\d{3,5})\s*м?[мм]?\b"),  # 3600, 5400, etc.
    )
    room_res = (
        re.compile(r"([Кк]вартира|[Кк]в\.?\s*\d+)", re.I),
        re.compile(r"([Пп]омещение)\s*(\d+)", re.I),
    )

    floors, axes, dimensions, rooms = set(), set(), set(), set()

    def first_hits(regexes, line):
        """Yield the full text of each regex's first match in *line*."""
        for rx in regexes:
            hit = rx.search(line)
            if hit:
                yield hit.group(0)

    for page_entries in ocr_data.values():
        for item in page_entries:
            line = item["text"].strip()

            # Floors and rooms: at most one hit per pattern per line.
            floors.update(first_hits(floor_res, line))
            rooms.update(first_hits(room_res, line))

            # Axes: every match, with spaces and hyphens removed.
            for rx in axis_res:
                for hit in rx.finditer(line):
                    # NOTE(review): the final replace("", "") is a no-op;
                    # possibly a dash character was lost here - confirm.
                    label = hit.group(0).replace(" ", "").replace("-", "").replace("", "")
                    if 0 < len(label) <= 3:
                        axes.add(label)

            # Dimensions: 3-5 digit numbers within the accepted range.
            for rx in dim_res:
                for hit in rx.finditer(line):
                    number = hit.group(1)
                    if 100 <= int(number) <= 50000:
                        dimensions.add(number)

    def plausible_axis(label):
        """Keep alphabetic labels and numeric labels between 1 and 50."""
        if label.isalpha():
            return True
        return label.isdigit() and 1 <= int(label) <= 50

    return {
        "floors": sorted(floors),
        "axes": sorted(label for label in axes if plausible_axis(label)),
        "dimensions": sorted(dimensions),
        "rooms": sorted(rooms),
    }
# ------------------------------------------------------------------
# 8. main
# ------------------------------------------------------------------
def main():
    """Run the full pipeline on the configured PDF.

    Steps: render pages to PNG, OCR them, run Docling, then extract drawing
    entities from the OCR text. Results are written into ``OUT_DIR``. Exits
    with status 1 when the configured PDF does not exist.
    """

    def save_json(target, payload):
        """Write *payload* as indented UTF-8 JSON and report the path."""
        with open(target, "w", encoding="utf-8") as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)
        print(f" -> sohraneno {target}")

    if not PDF_PATH.exists():
        print(f"[ERR] Файл не найден: {PDF_PATH}")
        sys.exit(1)

    # Open the document briefly just to report its page count.
    pdf = fitz.open(PDF_PATH)
    page_count = pdf.page_count
    pdf.close()
    print(f"\n=== PDF: {PDF_PATH.name} ===")
    print(f"Vsego stranits: {page_count}")

    # Step 1: render pages
    print("\n[1/4] Render stranits v izobrazheniya...")
    images = render_pages(PDF_PATH, OUT_DIR, MAX_PAGES_OCR, DPI)

    # Step 2: OCR
    print("\n[2/4] OCR (RapidOCR)...")
    ocr_results = run_ocr(images)
    save_json(OUT_DIR / "ocr_results.json", ocr_results)

    # Step 3: Docling
    print("\n[3/4] Strukturnyy analiz (Docling)...")
    md_text = run_docling(PDF_PATH, max_pages=MAX_PAGES_OCR)
    if not md_text:
        print(" Docling ne dal rezultata")
    else:
        md_path = OUT_DIR / "docling_output.md"
        with open(md_path, "w", encoding="utf-8") as fh:
            fh.write(md_text)
        print(f" -> sohraneno {md_path}")

    # Step 4: custom drawing parser
    print("\n[4/4] Custom parser chertezha...")
    parsed = parse_drawing_text(ocr_results)
    save_json(OUT_DIR / "parsed_entities.json", parsed)

    print("\n--- Naydennye sushchnosti ---")
    for category, values in parsed.items():
        print(f" {category}: {values}")
    print(f"\n=== Gotovo. Rezultaty v {OUT_DIR} ===")


if __name__ == "__main__":
    main()