opencode/process_any_pdf.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Универсальное распознавание PDF в указанную папку.
Использование:
    python process_any_pdf.py <pdf_file> <output_folder_name>
"""

import sys
import json
import fitz
from pathlib import Path
from rapidocr_onnxruntime import RapidOCR

# ------------------------------------------------------------------
# Parameters
# ------------------------------------------------------------------
# Rendering resolution; pages are rasterized with a zoom factor of DPI / 72.
DPI = 300
# The JSON results file is rewritten after every BATCH_SIZE pages.
BATCH_SIZE = 5

# Single OCR engine instance, created at import time and reused for all pages.
engine = RapidOCR()

# ------------------------------------------------------------------
def process_pdf(pdf_path: Path, out_dir: Path):
    """Render every page of a PDF to PNG, OCR it, and save the results as JSON.

    For each page the function extracts the embedded PDF text layer, renders
    the page at ``DPI`` into ``out_dir/page_NNN.png``, runs the module-level
    ``engine`` on that image, and collects the recognized lines. The
    accumulated results are written to ``out_dir/full_ocr_results.json``
    after every ``BATCH_SIZE`` pages and after the last page, so an
    interrupted run keeps the results up to the last save.

    Args:
        pdf_path: Path to the PDF file to process.
        out_dir: Directory for the page images and the JSON result; it is
            created (including parents) when it does not exist.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    results_path = out_dir / "full_ocr_results.json"
    tmp_path = out_dir / "full_ocr_results.json.tmp"

    # Loop-invariant: the same DPI / 72 zoom is applied on both axes.
    zoom = fitz.Matrix(DPI / 72, DPI / 72)

    # The context manager closes the document even if rendering or OCR raises.
    with fitz.open(pdf_path) as doc:
        total = len(doc)
        print(f"=== PDF: {pdf_path.name} | Страниц: {total} -> {out_dir} ===\n")

        all_pages = []
        for i in range(total):
            # flush so the progress prefix is visible while the page is
            # still being processed, not only after the trailing newline.
            print(f"[{i+1}/{total}] Рендер + OCR ...", end=" ", flush=True)
            page = doc.load_page(i)
            raw_text = page.get_text("text").strip()

            pix = page.get_pixmap(matrix=zoom)
            img_path = out_dir / f"page_{i+1:03d}.png"
            pix.save(img_path)

            res = engine(img_path)
            ocr_lines = []
            # The engine's first element is None when nothing was recognized.
            if res and res[0] is not None:
                ocr_lines = [
                    {
                        "text": txt,
                        "confidence": float(score),
                        "bbox": box,
                    }
                    for box, txt, score in res[0]
                ]

            all_pages.append({
                "page_number": i + 1,
                "image": str(img_path.name),
                "pdf_text_layer": raw_text,
                "ocr_lines": ocr_lines,
                "ocr_line_count": len(ocr_lines),
            })
            print(f"OCR строк: {len(ocr_lines)}")

            if (i + 1) % BATCH_SIZE == 0 or i == total - 1:
                # Write to a temporary file and swap it in, so a crash during
                # the save cannot truncate the previous checkpoint.
                with open(tmp_path, "w", encoding="utf-8") as f:
                    json.dump({"pages": all_pages}, f, ensure_ascii=False, indent=2)
                tmp_path.replace(results_path)
                print(f"  -> промежуточное сохранение ({i+1} страниц)")

    print(f"\n=== Готово. Результат в {out_dir} ===")

# ------------------------------------------------------------------
def main():
    """Command-line entry point: resolve the arguments and run ``process_pdf``.

    Usage: ``process_any_pdf.py [pdf_file] [output_folder_name]``.
    Without arguments the defaults ``123.pdf`` / ``output_123`` are used;
    when only ``pdf_file`` is given, the output folder defaults to
    ``output_<pdf stem>``.

    Exits with status 1 (message on stderr) when ``pdf_file`` is not an
    existing regular file.
    """
    args = sys.argv[1:]
    if not args:
        pdf_file = "123.pdf"
        out_name = "output_123"
    else:
        pdf_file = args[0]
        out_name = args[1] if len(args) > 1 else f"output_{Path(pdf_file).stem}"

    # A relative path is already resolved against the current directory, so
    # no separate "./" fallback is needed.
    pdf_path = Path(pdf_file)
    out_dir = Path(out_name)

    # is_file() rather than exists(): a directory must be rejected here
    # instead of being handed on to process_pdf.
    if not pdf_path.is_file():
        print(f"[ERR] Файл не найден: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    process_pdf(pdf_path, out_dir)

# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`
			`"""`
			`Универсальное распознавание PDF в указанную папку.`
			`Использование:`
			`python process_any_pdf.py <pdf_file> <output_folder_name>`
			`"""`

			`import sys`
			`import json`
			`import fitz`
			`from pathlib import Path`
Add RAG pipeline: LightRAG indexer, OpenCode API, VLM describer, and test tools - Add rag_indexer.py: build LightRAG index from OCR with OpenCode API - Add rag_query.py: query the knowledge graph - Add vlm_describer.py: generate VLM descriptions via LM Studio - Add test_model.py: quick check for LightRAG-compatible models - Add run_pipeline.sh and run_pipeline.bat: full OCR → VLM → RAG pipeline - Fix rapidocr import (rapidocr_onnxruntime) - Fix process_any_pdf.py paths for cross-platform use - Add .env.example, README_RAG.md, AGENTS.md - Update .gitignore for outputs and secrets 2026-05-29 06:54:37 +00:00			`from rapidocr_onnxruntime import RapidOCR`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00
			`# ------------------------------------------------------------------`
			`# Параметры`
			`# ------------------------------------------------------------------`
			`DPI = 300`
			`BATCH_SIZE = 5`

			`engine = RapidOCR()`

			`# ------------------------------------------------------------------`
			`def process_pdf(pdf_path: Path, out_dir: Path):`
			`out_dir.mkdir(parents=True, exist_ok=True)`
			`doc = fitz.open(pdf_path)`
			`total = len(doc)`
			`print(f"=== PDF: {pdf_path.name} \| Страниц: {total} -> {out_dir} ===\n")`

			`all_pages = []`
			`for i in range(total):`
			`print(f"[{i+1}/{total}] Рендер + OCR ...", end=" ")`
			`page = doc.load_page(i)`
			`raw_text = page.get_text("text").strip()`

			`mat = fitz.Matrix(DPI / 72, DPI / 72)`
			`pix = page.get_pixmap(matrix=mat)`
			`img_path = out_dir / f"page_{i+1:03d}.png"`
			`pix.save(img_path)`

			`res = engine(img_path)`
			`ocr_lines = []`
Add RAG pipeline: LightRAG indexer, OpenCode API, VLM describer, and test tools - Add rag_indexer.py: build LightRAG index from OCR with OpenCode API - Add rag_query.py: query the knowledge graph - Add vlm_describer.py: generate VLM descriptions via LM Studio - Add test_model.py: quick check for LightRAG-compatible models - Add run_pipeline.sh and run_pipeline.bat: full OCR → VLM → RAG pipeline - Fix rapidocr import (rapidocr_onnxruntime) - Fix process_any_pdf.py paths for cross-platform use - Add .env.example, README_RAG.md, AGENTS.md - Update .gitignore for outputs and secrets 2026-05-29 06:54:37 +00:00			`if res and res[0] is not None:`
			`for item in res[0]:`
			`box, txt, score = item`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00			`ocr_lines.append({`
			`"text": txt,`
			`"confidence": float(score),`
Add RAG pipeline: LightRAG indexer, OpenCode API, VLM describer, and test tools - Add rag_indexer.py: build LightRAG index from OCR with OpenCode API - Add rag_query.py: query the knowledge graph - Add vlm_describer.py: generate VLM descriptions via LM Studio - Add test_model.py: quick check for LightRAG-compatible models - Add run_pipeline.sh and run_pipeline.bat: full OCR → VLM → RAG pipeline - Fix rapidocr import (rapidocr_onnxruntime) - Fix process_any_pdf.py paths for cross-platform use - Add .env.example, README_RAG.md, AGENTS.md - Update .gitignore for outputs and secrets 2026-05-29 06:54:37 +00:00			`"bbox": box`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00			`})`

			`all_pages.append({`
			`"page_number": i + 1,`
			`"image": str(img_path.name),`
			`"pdf_text_layer": raw_text,`
			`"ocr_lines": ocr_lines,`
			`"ocr_line_count": len(ocr_lines)`
			`})`
			`print(f"OCR строк: {len(ocr_lines)}")`

			`if (i + 1) % BATCH_SIZE == 0 or i == total - 1:`
			`with open(out_dir / "full_ocr_results.json", "w", encoding="utf-8") as f:`
			`json.dump({"pages": all_pages}, f, ensure_ascii=False, indent=2)`
			`print(f" -> промежуточное сохранение ({i+1} страниц)")`

			`doc.close()`
			`print(f"\n=== Готово. Результат в {out_dir} ===")`

			`# ------------------------------------------------------------------`
			`def main():`
			`if len(sys.argv) < 2:`
			`pdf_file = "123.pdf"`
			`out_name = "output_123"`
			`else:`
			`pdf_file = sys.argv[1]`
			`out_name = sys.argv[2] if len(sys.argv) > 2 else f"output_{Path(pdf_file).stem}"`

Add RAG pipeline: LightRAG indexer, OpenCode API, VLM describer, and test tools - Add rag_indexer.py: build LightRAG index from OCR with OpenCode API - Add rag_query.py: query the knowledge graph - Add vlm_describer.py: generate VLM descriptions via LM Studio - Add test_model.py: quick check for LightRAG-compatible models - Add run_pipeline.sh and run_pipeline.bat: full OCR → VLM → RAG pipeline - Fix rapidocr import (rapidocr_onnxruntime) - Fix process_any_pdf.py paths for cross-platform use - Add .env.example, README_RAG.md, AGENTS.md - Update .gitignore for outputs and secrets 2026-05-29 06:54:37 +00:00			`pdf_path = Path(pdf_file) if Path(pdf_file).exists() else Path(".") / pdf_file`
			`out_dir = Path(out_name)`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00
			`if not pdf_path.exists():`
			`print(f"[ERR] Файл не найден: {pdf_path}")`
			`sys.exit(1)`

			`process_pdf(pdf_path, out_dir)`

			`if __name__ == "__main__":`
			`main()`