2026-05-28 22:04:01 +00:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
"""
|
|
|
|
|
|
Полное распознавание PDF без Docling (из-за нехватки RAM на CPU).
|
|
|
|
|
|
Стек: PyMuPDF (рендер + текстовый слой) + RapidOCR (текст+координаты).
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
|
import sys
|
|
|
|
|
|
import json
|
|
|
|
|
|
import fitz
|
|
|
|
|
|
from pathlib import Path
|
2026-06-01 09:49:29 +00:00
|
|
|
|
from rapidocr import RapidOCR
|
2026-05-28 22:04:01 +00:00
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
# 1. Configuration
# ------------------------------------------------------------------
# Input document: file name and the folder it is read from.
PDF_NAME = "Кронштадтский 16-18 НК1_ОСК (v3).pdf"
PDF_PATH = Path(r"D:\TEST docs") / PDF_NAME

# Folder that receives the rendered page images and the JSON results.
OUT_DIR = Path(r"D:\TEST docs\output")
# parents=True: without it a missing parent folder raises FileNotFoundError
# at import time, before main() gets a chance to report the missing PDF.
OUT_DIR.mkdir(parents=True, exist_ok=True)

DPI = 300          # render resolution
BATCH_SIZE = 5     # save progress every N pages
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
# 2. Render + OCR of a single page
# ------------------------------------------------------------------
# Shared OCR engine: created once at import time and reused for every page.
engine = RapidOCR()


def _extract_ocr_lines(res):
    """Convert a RapidOCR result into a list of JSON-serialisable dicts.

    Two result shapes are accepted:

    * an output object exposing ``boxes`` / ``txts`` / ``scores``
      (NOTE(review): this is what the ``rapidocr`` package is expected to
      return -- confirm against the installed version);
    * the legacy ``(result, elapse)`` tuple, where ``result`` is a list of
      ``[box, text, score]`` items or ``None``.

    Returns an empty list when the result carries no recognised text.
    """
    if hasattr(res, "txts"):
        boxes = getattr(res, "boxes", None)
        txts = res.txts
        scores = getattr(res, "scores", None)
        if boxes is None or txts is None or scores is None:
            return []
        items = zip(boxes, txts, scores)
    elif res and res[0] is not None:
        items = res[0]
    else:
        return []

    return [
        {
            "text": txt,
            "confidence": float(score),
            # Array-like boxes (e.g. NumPy) are not JSON-serialisable;
            # turn them into plain nested lists.
            "bbox": box.tolist() if hasattr(box, "tolist") else box,
        }
        for box, txt, score in items
    ]


def process_page(doc: fitz.Document, page_num: int, dpi: int = 300):
    """Process one page: read its text layer, render it to PNG and OCR it.

    Args:
        doc: Open PyMuPDF document.
        page_num: Zero-based page index passed to ``doc.load_page``.
        dpi: Render resolution; the page is scaled by ``dpi / 72``.

    Returns:
        A dict with the keys ``page_number`` (one-based), ``image`` (PNG
        file name), ``pdf_text_layer``, ``ocr_lines`` and ``ocr_line_count``.

    Side effects:
        Writes ``page_NNN.png`` into ``OUT_DIR``.
    """
    page = doc.load_page(page_num)

    # --- 2.1 PDF text layer (if present) ---
    raw_text = page.get_text("text").strip()

    # --- 2.2 Render to PNG ---
    mat = fitz.Matrix(dpi / 72, dpi / 72)
    pix = page.get_pixmap(matrix=mat)
    img_path = OUT_DIR / f"page_{page_num+1:03d}.png"
    pix.save(img_path)
    # The bitmap is already on disk; drop it before OCR to lower peak memory.
    del pix

    # --- 2.3 OCR ---
    ocr_lines = _extract_ocr_lines(engine(img_path))

    return {
        "page_number": page_num + 1,
        "image": str(img_path.name),
        "pdf_text_layer": raw_text,
        "ocr_lines": ocr_lines,
        "ocr_line_count": len(ocr_lines),
    }
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
# 3. main
# ------------------------------------------------------------------
def _save_results(pages, path):
    """Write the collected page records to *path* as UTF-8 JSON.

    The data goes to a sibling ``.tmp`` file first and is then moved over
    *path*, so an interruption in the middle of a write cannot leave a
    truncated results file behind.
    """
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump({"pages": pages}, f, ensure_ascii=False, indent=2)
    tmp_path.replace(path)


def main():
    """Run render + OCR over every page of ``PDF_PATH`` and save the results.

    Exits with status 1 when the PDF does not exist. Progress is printed
    page by page, and the accumulated results are saved to
    ``OUT_DIR / "full_ocr_results.json"`` every ``BATCH_SIZE`` pages, after
    the last page, and once more when the loop is finished.
    """
    if not PDF_PATH.exists():
        print(f"[ERR] Файл не найден: {PDF_PATH}")
        sys.exit(1)

    results_path = OUT_DIR / "full_ocr_results.json"
    all_pages = []

    doc = fitz.open(PDF_PATH)
    try:
        total = len(doc)
        print(f"=== PDF: {PDF_PATH.name} | Страниц: {total} ===\n")

        for i in range(total):
            # flush=True: the line has no newline yet, so without a flush the
            # progress message would only show up after OCR has finished.
            print(f"[{i+1}/{total}] Рендер + OCR ...", end=" ", flush=True)
            page_data = process_page(doc, i, DPI)
            all_pages.append(page_data)
            print(f"OCR строк: {page_data['ocr_line_count']}")

            # Intermediate save
            if (i + 1) % BATCH_SIZE == 0 or i == total - 1:
                _save_results(all_pages, results_path)
                print(f" -> промежуточное сохранение ({i+1} страниц)")
    finally:
        # Close the document even when a page fails to process.
        doc.close()

    # Final file (same path as the checkpoints; also covers an empty document)
    _save_results(all_pages, results_path)

    print(f"\n=== Готово. Обработано {len(all_pages)} страниц ===")
    print(f"Результат: {results_path}")


if __name__ == "__main__":
    main()
|