opencode/process_pdf_full.py

102 lines
3.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Полное распознавание PDF без Docling (из-за нехватки RAM на CPU).
Стек: PyMuPDF (рендер + текстовый слой) + RapidOCR (текст+координаты).
"""
import os
import sys
import json
import fitz
from pathlib import Path
from rapidocr import RapidOCR
# ------------------------------------------------------------------
# 1. Configuration
# ------------------------------------------------------------------
PDF_NAME = "Кронштадтский 16-18 НК1_ОСК (v3).pdf"
PDF_PATH = Path(r"D:\TEST docs") / PDF_NAME
OUT_DIR = Path(r"D:\TEST docs\output")
# parents=True so that a missing parent folder does not abort the script with
# FileNotFoundError at import time, before main() can report a missing PDF.
OUT_DIR.mkdir(parents=True, exist_ok=True)
DPI = 300  # render resolution
BATCH_SIZE = 5  # save progress every N pages
# ------------------------------------------------------------------
# 2. Render + OCR of a single page
# ------------------------------------------------------------------
# Single module-level OCR engine, constructed once (with no arguments) when
# the module is loaded and called by process_page() for every page.
engine = RapidOCR()
def _ocr_lines_from_result(res) -> list:
    """Convert a RapidOCR call result into a list of JSON-serialisable dicts.

    Two result shapes are accepted:

    * an object exposing parallel ``boxes`` / ``txts`` / ``scores``
      attributes (NOTE(review): this is what the unified ``rapidocr``
      package is expected to return -- confirm against the installed version);
    * the legacy ``(items, elapse)`` tuple, where ``items`` is a sequence of
      ``(box, text, score)`` triples or ``None`` when nothing was recognised.

    Returns an empty list when the result carries no recognised text.
    """
    if res is None:
        return []

    if hasattr(res, "txts"):
        boxes, txts, scores = res.boxes, res.txts, res.scores
        if boxes is None or txts is None or scores is None:
            return []
        triples = zip(boxes, txts, scores)
    else:
        if not res or res[0] is None:
            return []
        triples = res[0]

    return [
        {
            "text": txt,
            "confidence": float(score),
            # numpy arrays are not JSON-serialisable; plain lists pass through.
            "bbox": box.tolist() if hasattr(box, "tolist") else box,
        }
        for box, txt, score in triples
    ]


def process_page(doc: fitz.Document, page_num: int, dpi: int = 300):
    """Process one page: extract its text layer, render it to PNG and run OCR.

    Args:
        doc: An open PyMuPDF document.
        page_num: Zero-based page index.
        dpi: Render resolution; the page is scaled by ``dpi / 72``.

    Returns:
        A dict with keys ``page_number`` (one-based), ``image`` (PNG file
        name inside ``OUT_DIR``), ``pdf_text_layer``, ``ocr_lines`` (list of
        ``{"text", "confidence", "bbox"}`` dicts) and ``ocr_line_count``.
    """
    page = doc.load_page(page_num)

    # --- 2.1 PDF text layer (if any) ---
    raw_text = page.get_text("text").strip()

    # --- 2.2 Render to PNG ---
    scale = dpi / 72
    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
    img_path = OUT_DIR / f"page_{page_num+1:03d}.png"
    pix.save(img_path)

    # --- 2.3 OCR ---
    ocr_lines = _ocr_lines_from_result(engine(img_path))

    return {
        "page_number": page_num + 1,
        "image": str(img_path.name),
        "pdf_text_layer": raw_text,
        "ocr_lines": ocr_lines,
        "ocr_line_count": len(ocr_lines),
    }
# ------------------------------------------------------------------
# 3. main
# ------------------------------------------------------------------
def _write_results(pages, target):
    """Write ``{"pages": pages}`` to *target* as JSON without ever truncating it.

    The data is dumped to a sibling ``*.tmp`` file first and then renamed over
    *target*, so an interruption during the dump leaves the previously saved
    results intact.
    """
    tmp_path = target.with_name(target.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump({"pages": pages}, f, ensure_ascii=False, indent=2)
    tmp_path.replace(target)


def main():
    """OCR every page of ``PDF_PATH`` and save the results as JSON in ``OUT_DIR``.

    Exits with status 1 if the PDF does not exist. Progress is saved to
    ``full_ocr_results.json`` every ``BATCH_SIZE`` pages and after the last
    page; the same file holds the final result.
    """
    if not PDF_PATH.exists():
        print(f"[ERR] Файл не найден: {PDF_PATH}")
        sys.exit(1)

    final_path = OUT_DIR / "full_ocr_results.json"
    all_pages = []

    doc = fitz.open(PDF_PATH)
    try:
        total = len(doc)
        print(f"=== PDF: {PDF_PATH.name} | Страниц: {total} ===\n")

        for i in range(total):
            # flush so the progress prefix is visible while OCR is running
            print(f"[{i+1}/{total}] Рендер + OCR ...", end=" ", flush=True)
            page_data = process_page(doc, i, DPI)
            all_pages.append(page_data)
            print(f"OCR строк: {page_data['ocr_line_count']}")

            # Intermediate save
            if (i + 1) % BATCH_SIZE == 0 or i == total - 1:
                _write_results(all_pages, final_path)
                print(f" -> промежуточное сохранение ({i+1} страниц)")
    finally:
        # Release the document even if a page fails to render or OCR.
        doc.close()

    # Final file (also covers a document with zero pages, where the loop
    # above never saves anything).
    _write_results(all_pages, final_path)
    print(f"\n=== Готово. Обработано {len(all_pages)} страниц ===")
    print(f"Результат: {final_path}")


if __name__ == "__main__":
    main()