opencode/process_pdf_full.py
Кирилл Блинов c756a5766b Add RAG pipeline: LightRAG indexer, OpenCode API, VLM describer, and test tools
- Add rag_indexer.py: build LightRAG index from OCR with OpenCode API
- Add rag_query.py: query the knowledge graph
- Add vlm_describer.py: generate VLM descriptions via LM Studio
- Add test_model.py: quick check for LightRAG-compatible models
- Add run_pipeline.sh and run_pipeline.bat: full OCR → VLM → RAG pipeline
- Fix rapidocr import (rapidocr_onnxruntime)
- Fix process_any_pdf.py paths for cross-platform use
- Add .env.example, README_RAG.md, AGENTS.md
- Update .gitignore for outputs and secrets
2026-05-29 09:54:37 +03:00

102 lines
3.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Full PDF recognition without Docling (not enough RAM when running on CPU).
Stack: PyMuPDF (rendering + text layer) + RapidOCR (text + coordinates).
"""
import os
import sys
import json
import fitz
from pathlib import Path
from rapidocr_onnxruntime import RapidOCR
# ------------------------------------------------------------------
# 1. Configuration
# ------------------------------------------------------------------
PDF_NAME = "Кронштадтский 16-18 НК1_ОСК (v3).pdf"
PDF_PATH = Path(r"D:\TEST docs") / PDF_NAME
OUT_DIR = Path(r"D:\TEST docs\output")
# parents=True: without it a missing parent directory raises
# FileNotFoundError here, at import time, before main() gets the chance
# to print its own "file not found" message for PDF_PATH.
OUT_DIR.mkdir(parents=True, exist_ok=True)
DPI = 300  # render resolution
BATCH_SIZE = 5  # save progress every N pages
# ------------------------------------------------------------------
# 2. Render + OCR of a single page
# ------------------------------------------------------------------
# Single OCR engine instance, created at import time and reused by every
# process_page() call.
engine = RapidOCR()
def process_page(doc: fitz.Document, page_num: int, dpi: int = 300) -> dict:
    """Render one page to PNG, run OCR on it and read its PDF text layer.

    The PNG is written to ``OUT_DIR`` as ``page_NNN.png`` (1-based,
    zero-padded to three digits) and then passed to the module-level OCR
    ``engine``.

    Args:
        doc: Open PyMuPDF document.
        page_num: Zero-based index of the page to process.
        dpi: Render resolution; the page is scaled by ``dpi / 72``.

    Returns:
        A dict with the keys ``page_number`` (1-based), ``image`` (PNG file
        name), ``pdf_text_layer`` (stripped text extracted from the PDF
        itself), ``ocr_lines`` (list of ``{"text", "confidence", "bbox"}``
        dicts) and ``ocr_line_count``.
    """
    page = doc.load_page(page_num)

    # --- 2.1 PDF text layer (if any) ---
    raw_text = page.get_text("text").strip()

    # --- 2.2 Render to PNG ---
    zoom = dpi / 72
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
    img_path = OUT_DIR / f"page_{page_num+1:03d}.png"
    pix.save(img_path)
    # The rendered bitmap is no longer needed once it is on disk; drop it
    # before OCR so it does not add to peak memory while the engine runs.
    del pix

    # --- 2.3 OCR ---
    res = engine(img_path)
    # The first element of the engine result holds the recognised items,
    # or None when there is nothing to report.
    detections = res[0] if res else None
    ocr_lines = []
    if detections is not None:
        # Each item unpacks as (bounding box, text, score).
        ocr_lines = [
            {
                "text": txt,
                "confidence": float(score),
                "bbox": box,
            }
            for box, txt, score in detections
        ]

    return {
        "page_number": page_num + 1,
        "image": str(img_path.name),
        "pdf_text_layer": raw_text,
        "ocr_lines": ocr_lines,
        "ocr_line_count": len(ocr_lines),
    }
# ------------------------------------------------------------------
# 3. main
# ------------------------------------------------------------------
def _write_results(pages, target):
    """Write *pages* to *target* as JSON without ever leaving it half-written.

    The data goes to a sibling ``.tmp`` file first and is then moved over
    *target*, so a crash during serialization keeps the previous checkpoint
    intact.
    """
    tmp_path = target.with_name(target.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump({"pages": pages}, f, ensure_ascii=False, indent=2)
    tmp_path.replace(target)


def main():
    """Run OCR over every page of ``PDF_PATH`` and save the results as JSON.

    Progress is checkpointed to ``OUT_DIR / "full_ocr_results.json"`` every
    ``BATCH_SIZE`` pages and after the last page. Exits with status 1 when
    the input PDF does not exist.
    """
    if not PDF_PATH.exists():
        print(f"[ERR] Файл не найден: {PDF_PATH}")
        sys.exit(1)

    results_path = OUT_DIR / "full_ocr_results.json"
    all_pages = []

    # Context manager closes the document even if a page fails to process.
    with fitz.open(PDF_PATH) as doc:
        total = len(doc)
        print(f"=== PDF: {PDF_PATH.name} | Страниц: {total} ===\n")

        for i in range(total):
            # flush=True: the line has no trailing newline, so without a
            # flush it would only show up after the page is finished.
            print(f"[{i+1}/{total}] Рендер + OCR ...", end=" ", flush=True)
            page_data = process_page(doc, i, DPI)
            all_pages.append(page_data)
            print(f"OCR строк: {page_data['ocr_line_count']}")

            # Intermediate checkpoint
            if (i + 1) % BATCH_SIZE == 0 or i == total - 1:
                _write_results(all_pages, results_path)
                print(f" -> промежуточное сохранение ({i+1} страниц)")

    # Final file. Kept unconditional so a document with zero pages still
    # produces an (empty) results file.
    final_path = results_path
    _write_results(all_pages, final_path)

    print(f"\n=== Готово. Обработано {len(all_pages)} страниц ===")
    print(f"Результат: {final_path}")
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()