#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Universal PDF OCR into a given output folder.

Supported engines:
- RapidOCR (local, fast)
- RapidOCR + tiling (for large drawings)
- qwen-vl-ocr (API, more accurate)

Usage:
    python process_any_pdf.py [pdf_file] [out_dir] [--use-qwen] [--use-tiling]

When no positional arguments are given, ``123.pdf`` is processed into
``output_123``. When only the PDF is given, the output folder defaults to
``output_<pdf stem>``.
"""

import sys
import json
import re
import tempfile
from pathlib import Path

import fitz
from PIL import Image
from rapidocr_onnxruntime import RapidOCR

# ------------------------------------------------------------------
# Parameters
# ------------------------------------------------------------------
DPI = 300            # page render resolution (PDF user space is 72 dpi)
BATCH_SIZE = 5       # pages between checkpoints of the JSON result file
TILE_SIZE = 2000     # tile edge length in pixels for tiling OCR
TILE_OVERLAP = 200   # overlap in pixels between neighbouring tiles

engine = RapidOCR()

# Optional qwen-vl-ocr backend: only usable when the ocr_qwen module imports.
try:
    from ocr_qwen import run_ocr as qwen_ocr
    QWEN_AVAILABLE = True
except ImportError:
    QWEN_AVAILABLE = False


# ------------------------------------------------------------------
# Tiling OCR helpers
# ------------------------------------------------------------------
def _make_tiles(img: Image.Image, tile_size: int = 2000, overlap: int = 200):
    """Split *img* into overlapping tiles.

    Args:
        img: Source image.
        tile_size: Maximum tile width/height in pixels.
        overlap: Pixels shared by neighbouring tiles; must be smaller than
            ``tile_size``.

    Returns:
        A list of ``(x, y, crop)`` tuples where ``(x, y)`` is the tile's
        top-left corner in *img* and ``crop`` is the cropped image.

    Raises:
        ValueError: If ``overlap >= tile_size`` (the scan step would be
            zero or negative).
    """
    step = tile_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than tile_size ({tile_size})"
        )

    w, h = img.size
    tiles = []
    for y in range(0, h, step):
        y2 = min(y + tile_size, h)
        for x in range(0, w, step):
            x2 = min(x + tile_size, w)
            tiles.append((x, y, img.crop((x, y, x2, y2))))
            # Once a tile touches the right edge, any further tile in this
            # row would lie entirely inside it; skip those slivers.
            if x2 >= w:
                break
        # Same reasoning for rows once the bottom edge has been reached.
        if y2 >= h:
            break
    return tiles


def _bbox_iou(a, b):
    """Return the intersection-over-union of two boxes.

    Each box is either a sequence of ``[x, y]`` points (reduced to its
    axis-aligned bounding rectangle) or a flat ``(x1, y1, x2, y2)`` sequence.
    Returns ``0.0`` when the boxes do not overlap or the union is empty.
    """
    def _rect(box):
        # Accept points stored as lists or tuples.
        if isinstance(box[0], (list, tuple)):
            xs = [p[0] for p in box]
            ys = [p[1] for p in box]
            return min(xs), min(ys), max(xs), max(ys)
        return box[0], box[1], box[2], box[3]

    ax1, ay1, ax2, ay2 = _rect(a)
    bx1, by1, bx2, by2 = _rect(b)
    ix1, iy1 = max(ax1, bx1), max(ay1, by1)
    ix2, iy2 = min(ax2, bx2), min(ay2, by2)
    if ix2 <= ix1 or iy2 <= iy1:
        return 0.0
    inter = (ix2 - ix1) * (iy2 - iy1)
    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    return inter / union if union > 0 else 0.0


def run_tiling_ocr(img_path: Path, conf_threshold: float = 0.5):
    """Run RapidOCR on overlapping crops of an image and merge the results.

    Args:
        img_path: Path of the image to recognise.
        conf_threshold: Detections scoring below this value are dropped.

    Returns:
        A list of dicts with keys ``text``, ``confidence`` and ``bbox``
        (points in full-image coordinates). Detections whose bounding
        rectangles overlap with IoU > 0.5 are de-duplicated, keeping the
        one with the highest confidence.
    """
    # Crops are independent images, so the source can be closed right away.
    with Image.open(img_path) as img:
        tiles = _make_tiles(img, TILE_SIZE, TILE_OVERLAP)

    all_results = []
    # A private temporary directory avoids the hard-coded /tmp path and
    # collisions between concurrent runs; it is removed on exit.
    with tempfile.TemporaryDirectory(prefix="tile_ocr_") as tmp_dir:
        tmp = str(Path(tmp_dir) / "tile.png")
        for off_x, off_y, crop in tiles:
            crop.save(tmp)
            res = engine(tmp)
            if not res or not res[0]:
                continue
            for box, txt, score in res[0]:
                # Normalise before comparing so a non-float score cannot
                # break the threshold check.
                score = float(score)
                if score < conf_threshold:
                    continue
                # Translate tile-local points into full-image coordinates.
                shifted = [[pt[0] + off_x, pt[1] + off_y] for pt in box]
                all_results.append(
                    {"text": txt, "confidence": score, "bbox": shifted}
                )

    # IoU-based de-duplication: visit best detections first so they win.
    unique = []
    for r in sorted(all_results, key=lambda x: -x["confidence"]):
        is_dup = any(_bbox_iou(r["bbox"], u["bbox"]) > 0.5 for u in unique)
        if not is_dup:
            unique.append(r)
    return unique


# ------------------------------------------------------------------
def _run_rapidocr(img_path: Path):
    """Run RapidOCR on a whole image.

    Returns:
        A list of dicts with keys ``text``, ``confidence`` and ``bbox``;
        empty when the engine reports no result.
    """
    res = engine(img_path)
    if not res or res[0] is None:
        return []
    return [
        {"text": txt, "confidence": float(score), "bbox": box}
        for box, txt, score in res[0]
    ]


def _save_results(out_dir: Path, pages):
    """Write the accumulated page results to ``full_ocr_results.json``."""
    with open(out_dir / "full_ocr_results.json", "w", encoding="utf-8") as f:
        json.dump({"pages": pages}, f, ensure_ascii=False, indent=2)


def process_pdf(pdf_path: Path, out_dir: Path, use_qwen: bool = False, use_tiling: bool = False):
    """Render every page of a PDF to PNG, OCR it and store the results.

    For each page the function saves ``page_NNN.png`` into *out_dir*,
    extracts the PDF text layer and runs the selected OCR engine. Results
    are checkpointed to ``full_ocr_results.json`` every ``BATCH_SIZE`` pages
    and after the last page.

    Args:
        pdf_path: PDF file to process.
        out_dir: Output folder; created if missing.
        use_qwen: Use qwen-vl-ocr. Falls back to a local engine when the
            backend is not importable, and to plain RapidOCR for a page
            whose qwen call raises.
        use_tiling: Use RapidOCR on overlapping tiles (ignored while qwen
            is in effect).
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    doc = fitz.open(pdf_path)
    try:
        total = len(doc)
        print(f"=== PDF: {pdf_path.name} | Страниц: {total} -> {out_dir} ===")

        # Do not announce qwen when it cannot actually be used.
        if use_qwen and not QWEN_AVAILABLE:
            print("[WARN] qwen-vl-ocr is unavailable (ocr_qwen import failed); "
                  "falling back to local OCR")
            use_qwen = False

        if use_qwen:
            print(f"[INFO] OCR engine: qwen-vl-ocr (API)")
        elif use_tiling:
            print(f"[INFO] OCR engine: RapidOCR + tiling ({TILE_SIZE}px tiles)")
        else:
            print(f"[INFO] OCR engine: RapidOCR (local)")
        print()

        # The render matrix is identical for every page; build it once.
        mat = fitz.Matrix(DPI / 72, DPI / 72)
        all_pages = []
        for i in range(total):
            print(f"[{i+1}/{total}] Рендер + OCR ...", end=" ", flush=True)
            page = doc.load_page(i)
            raw_text = page.get_text("text").strip()
            pix = page.get_pixmap(matrix=mat)
            img_path = out_dir / f"page_{i+1:03d}.png"
            pix.save(img_path)

            # OCR engine selection
            if use_qwen:
                try:
                    # NOTE(review): the result is stored as-is, so it is
                    # assumed to be a JSON-serialisable list - confirm
                    # against ocr_qwen.run_ocr.
                    ocr_lines = qwen_ocr(img_path, verbose=False)
                    print(f"qwen-ocr строк: {len(ocr_lines)}")
                except Exception as e:
                    # Best effort: one failing API call must not abort the
                    # whole document.
                    print(f"qwen-ocr ERR: {e}, fallback to RapidOCR")
                    ocr_lines = _run_rapidocr(img_path)
                    print(f"RapidOCR строк: {len(ocr_lines)}")
            elif use_tiling:
                ocr_lines = run_tiling_ocr(img_path)
                print(f"Tiling OCR строк: {len(ocr_lines)}")
            else:
                ocr_lines = _run_rapidocr(img_path)
                print(f"RapidOCR строк: {len(ocr_lines)}")

            all_pages.append({
                "page_number": i + 1,
                "image": str(img_path.name),
                "pdf_text_layer": raw_text,
                "ocr_lines": ocr_lines,
                "ocr_line_count": len(ocr_lines),
            })

            # Checkpoint so an interrupted run keeps the pages done so far.
            if (i + 1) % BATCH_SIZE == 0 or i == total - 1:
                _save_results(out_dir, all_pages)
                print(f" -> сохранено ({i+1} страниц)")
    finally:
        # Release the document even if rendering or OCR raised.
        doc.close()

    print(f"\n=== Готово. Результат в {out_dir} ===")


# ------------------------------------------------------------------
def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Exits with status 1 when the input PDF does not exist.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    flags = ("--use-qwen", "--use-tiling")
    use_qwen = "--use-qwen" in args
    use_tiling = "--use-tiling" in args
    # Work on a filtered copy instead of mutating the global sys.argv.
    positional = [a for a in args if a not in flags]

    if positional:
        pdf_file = positional[0]
        if len(positional) > 1:
            out_name = positional[1]
        else:
            out_name = f"output_{Path(pdf_file).stem}"
    else:
        pdf_file = "123.pdf"
        out_name = "output_123"

    pdf_path = Path(pdf_file)
    out_dir = Path(out_name)

    if not pdf_path.exists():
        print(f"[ERR] Файл не найден: {pdf_path}")
        sys.exit(1)

    process_pdf(pdf_path, out_dir, use_qwen=use_qwen, use_tiling=use_tiling)


if __name__ == "__main__":
    main()