- Add rag_indexer.py: build LightRAG index from OCR with OpenCode API - Add rag_query.py: query the knowledge graph - Add vlm_describer.py: generate VLM descriptions via LM Studio - Add test_model.py: quick check for LightRAG-compatible models - Add run_pipeline.sh and run_pipeline.bat: full OCR → VLM → RAG pipeline - Fix rapidocr import (rapidocr_onnxruntime) - Fix process_any_pdf.py paths for cross-platform use - Add .env.example, README_RAG.md, AGENTS.md - Update .gitignore for outputs and secrets
89 lines
2.8 KiB
Python
89 lines
2.8 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Universal OCR of a PDF into a specified output folder.
|
|
Usage:
|
|
    python process_any_pdf.py <pdf_file> [<output_folder_name>]
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import fitz
|
|
from pathlib import Path
|
|
from rapidocr_onnxruntime import RapidOCR
|
|
|
|
# ------------------------------------------------------------------
# Parameters
# ------------------------------------------------------------------
# Resolution, in dots per inch, at which each page is rendered before OCR.
DPI = 300
# Number of pages processed between intermediate saves of the JSON results.
BATCH_SIZE = 5

# One OCR engine instance, created at import time and reused for every page.
engine = RapidOCR()
|
|
|
|
# ------------------------------------------------------------------
|
|
def process_pdf(pdf_path: Path, out_dir: Path) -> None:
    """Render each page of a PDF to PNG, OCR it, and save the results as JSON.

    For every page the embedded text layer is extracted, the page is rendered
    at ``DPI`` and saved to ``out_dir`` as ``page_NNN.png``, and that image is
    passed to the module-level RapidOCR ``engine``. The accumulated results
    are written to ``out_dir / "full_ocr_results.json"`` after every
    ``BATCH_SIZE`` pages and again after the last page; each write replaces
    the previous file with all pages processed so far.

    Args:
        pdf_path: Path of the PDF file to process.
        out_dir: Directory that receives the page images and the JSON file.
            It is created (including parents) if it does not exist.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    results_path = out_dir / "full_ocr_results.json"

    # The zoom matrix depends only on DPI, so build it once instead of per
    # page. An identity matrix renders at 72 dpi, hence the DPI / 72 scale.
    mat = fitz.Matrix(DPI / 72, DPI / 72)

    # The context manager closes the document even if rendering, OCR or the
    # JSON write raises part-way through.
    with fitz.open(pdf_path) as doc:
        total = len(doc)
        print(f"=== PDF: {pdf_path.name} | Страниц: {total} -> {out_dir} ===\n")

        all_pages = []
        for i in range(total):
            page_no = i + 1
            # Flush so the progress prefix is visible while the slow OCR runs.
            print(f"[{page_no}/{total}] Рендер + OCR ...", end=" ", flush=True)

            page = doc.load_page(i)
            raw_text = page.get_text("text").strip()

            pix = page.get_pixmap(matrix=mat)
            img_path = out_dir / f"page_{page_no:03d}.png"
            pix.save(img_path)

            # The engine's first return element holds the recognised lines as
            # (box, text, score) triples, or None when nothing was recognised.
            res = engine(img_path)
            ocr_lines = []
            if res and res[0] is not None:
                ocr_lines = [
                    {"text": txt, "confidence": float(score), "bbox": box}
                    for box, txt, score in res[0]
                ]

            all_pages.append({
                "page_number": page_no,
                "image": str(img_path.name),
                "pdf_text_layer": raw_text,
                "ocr_lines": ocr_lines,
                "ocr_line_count": len(ocr_lines),
            })
            print(f"OCR строк: {len(ocr_lines)}")

            # Intermediate save: rewrite the whole file every BATCH_SIZE pages
            # and once more after the final page.
            if page_no % BATCH_SIZE == 0 or i == total - 1:
                with open(results_path, "w", encoding="utf-8") as f:
                    json.dump({"pages": all_pages}, f, ensure_ascii=False, indent=2)
                print(f" -> промежуточное сохранение ({page_no} страниц)")

    print(f"\n=== Готово. Результат в {out_dir} ===")
|
|
|
|
# ------------------------------------------------------------------
|
|
def main():
    """Command-line entry point: resolve the arguments and run ``process_pdf``.

    Usage: ``python process_any_pdf.py <pdf_file> [<output_folder_name>]``.

    With no arguments, ``123.pdf`` is processed into ``output_123``. When only
    the PDF is given, the output folder defaults to ``output_<pdf stem>``.

    Exits with status 1 (after printing an error) when the input path is not
    an existing regular file.
    """
    args = sys.argv[1:]
    if not args:
        pdf_file = "123.pdf"
        out_name = "output_123"
    else:
        pdf_file = args[0]
        out_name = args[1] if len(args) > 1 else f"output_{Path(pdf_file).stem}"

    pdf_path = Path(pdf_file)
    out_dir = Path(out_name)

    # is_file() rather than exists(): a directory (or an empty argument, which
    # resolves to ".") exists but cannot be opened as a PDF.
    if not pdf_path.is_file():
        print(f"[ERR] Файл не найден: {pdf_path}")
        sys.exit(1)

    process_pdf(pdf_path, out_dir)
|
|
|
|
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|