#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Run OCR over every page of a PDF and write the results to a folder.

Usage:
    python process_any_pdf.py [PDF_FILE [OUT_NAME]]

Both arguments are joined onto ``BASE_DIR``. Without arguments the script
processes ``123.pdf`` into ``output_123``; when only PDF_FILE is given the
output folder is named ``output_<stem of PDF_FILE>``.
"""
import json
import sys
from pathlib import Path

import fitz
from rapidocr import RapidOCR

# ------------------------------------------------------------------
# Settings
# ------------------------------------------------------------------
DPI = 300  # resolution used when rendering pages to PNG for OCR
BATCH_SIZE = 5  # the JSON checkpoint is rewritten every BATCH_SIZE pages
BASE_DIR = Path(r"D:\TEST docs")  # folder holding the input PDFs and the outputs
RESULTS_NAME = "full_ocr_results.json"

engine = RapidOCR()


# ------------------------------------------------------------------
def _save_results(out_dir: Path, pages: list) -> None:
    """Write *pages* to the results JSON in *out_dir*.

    The data is dumped to a sibling ``.tmp`` file first and then moved over
    the real file, so an interruption during the dump does not destroy the
    previously saved checkpoint.
    """
    target = out_dir / RESULTS_NAME
    tmp = target.with_name(target.name + ".tmp")
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump({"pages": pages}, f, ensure_ascii=False, indent=2)
    tmp.replace(target)


def _ocr_image(img_path: Path) -> list:
    """Run the module-level OCR engine on *img_path* and return its lines.

    Each line is a dict with ``text``, ``confidence`` (float) and ``bbox``.
    An empty list is returned when the engine yields no result or no texts.
    """
    res = engine(img_path)
    if not res or res.txts is None:
        return []
    return [
        {
            "text": txt,
            "confidence": float(score),
            # Boxes that expose tolist() (e.g. arrays) are converted so that
            # json.dump can serialise them; anything else is stored as-is.
            "bbox": box.tolist() if hasattr(box, "tolist") else box,
        }
        for txt, box, score in zip(res.txts, res.boxes, res.scores)
    ]


def process_pdf(pdf_path: Path, out_dir: Path):
    """Render every page of *pdf_path* to PNG, OCR it and save the results.

    For each page the function stores ``page_NNN.png`` in *out_dir* and
    collects the PDF text layer plus the OCR lines. The collected pages are
    written to ``full_ocr_results.json`` in *out_dir* every ``BATCH_SIZE``
    pages and after the last page.

    Args:
        pdf_path: PDF file to process.
        out_dir: Output folder; created (with parents) when missing.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    # The scale is the same for every page, so build the matrix once.
    # 72 is the base resolution that DPI is scaled against.
    zoom = fitz.Matrix(DPI / 72, DPI / 72)
    all_pages = []

    # The context manager closes the document even if rendering/OCR raises.
    with fitz.open(pdf_path) as doc:
        total = len(doc)
        print(f"=== PDF: {pdf_path.name} | Страниц: {total} -> {out_dir} ===\n")

        for page_no in range(1, total + 1):
            # flush so the progress prefix is visible while OCR is running
            print(f"[{page_no}/{total}] Рендер + OCR ...", end=" ", flush=True)
            page = doc.load_page(page_no - 1)
            raw_text = page.get_text("text").strip()

            img_path = out_dir / f"page_{page_no:03d}.png"
            page.get_pixmap(matrix=zoom).save(img_path)

            ocr_lines = _ocr_image(img_path)
            all_pages.append({
                "page_number": page_no,
                "image": img_path.name,
                "pdf_text_layer": raw_text,
                "ocr_lines": ocr_lines,
                "ocr_line_count": len(ocr_lines),
            })
            print(f"OCR строк: {len(ocr_lines)}")

            if page_no % BATCH_SIZE == 0 or page_no == total:
                _save_results(out_dir, all_pages)
                print(f" -> промежуточное сохранение ({page_no} страниц)")

    if not all_pages:
        # A document without pages never reaches the save inside the loop;
        # still emit an (empty) results file so the output folder is complete.
        _save_results(out_dir, all_pages)

    print(f"\n=== Готово. Результат в {out_dir} ===")


# ------------------------------------------------------------------
def main():
    """Parse the command line and process the selected PDF.

    Exits with status 1 when the PDF is not an existing file.
    """
    args = sys.argv[1:]
    pdf_file = args[0] if args else "123.pdf"
    # For the default "123.pdf" this yields "output_123".
    out_name = args[1] if len(args) > 1 else f"output_{Path(pdf_file).stem}"

    pdf_path = BASE_DIR / pdf_file
    out_dir = BASE_DIR / out_name

    # is_file() also rejects a directory with that name, which fitz.open
    # could not process anyway.
    if not pdf_path.is_file():
        print(f"[ERR] Файл не найден: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    process_pdf(pdf_path, out_dir)


if __name__ == "__main__":
    main()