- Add rag_indexer.py: build LightRAG index from OCR with OpenCode API
- Add rag_query.py: query the knowledge graph
- Add vlm_describer.py: generate VLM descriptions via LM Studio
- Add test_model.py: quick check for LightRAG-compatible models
- Add run_pipeline.sh and run_pipeline.bat: full OCR → VLM → RAG pipeline
- Fix rapidocr import (rapidocr_onnxruntime)
- Fix process_any_pdf.py paths for cross-platform use
- Add .env.example, README_RAG.md, AGENTS.md
- Update .gitignore for outputs and secrets
39 lines
1.1 KiB
Bash
Executable File
39 lines
1.1 KiB
Bash
Executable File
#!/bin/bash
# run_pipeline.sh — full pipeline: OCR → VLM → RAG index
#
# Usage: run_pipeline.sh [PDF] [OUT]
#   PDF  input PDF file        (default: 123.pdf)
#   OUT  output directory name (default: output_123)
#
# Environment:
#   PYTHON  Python interpreter to use. When unset, the Homebrew
#           python3.11 is preferred, then python3.11 / python3 from PATH.
#
# The helper scripts (process_any_pdf.py, vlm_describer.py, rag_indexer.py,
# rag_query.py) are invoked by relative path, so run this script from the
# directory that contains them.

# Abort on the first failing step, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -euo pipefail

PDF="${1:-123.pdf}"
OUT="${2:-output_123}"

# Pick the interpreter. An explicit PYTHON from the environment always wins;
# otherwise keep the historical Homebrew default when it exists and fall back
# to whatever is on PATH so the script also works off Apple-silicon macOS.
if [[ -z "${PYTHON:-}" ]]; then
  PYTHON="/opt/homebrew/bin/python3.11"
  if [[ ! -x "$PYTHON" ]]; then
    PYTHON="$(command -v python3.11 || command -v python3 || true)"
  fi
fi
if [[ -z "$PYTHON" ]] || ! command -v "$PYTHON" >/dev/null 2>&1; then
  printf 'run_pipeline.sh: no usable Python interpreter found (set PYTHON)\n' >&2
  exit 1
fi

echo "=============================================="
echo " Pipeline: $PDF → $OUT"
echo "=============================================="

# 1. OCR
echo ""
echo "[1/4] OCR: PDF → PNG + JSON"
"$PYTHON" process_any_pdf.py "$PDF" "$OUT"

# 2. VLM descriptions
echo ""
echo "[2/4] VLM: PNG → описания (LM Studio)"
"$PYTHON" vlm_describer.py "$OUT" --model qwen/qwen3-vl-4b

# 3. RAG Index
echo ""
echo "[3/4] RAG индекс: JSON + VLM → граф знаний"
# Drop any previous cache before indexing (presumably so the index is rebuilt
# from scratch). ${OUT:?} aborts instead of expanding to "/lightrag_cache"
# should OUT ever be empty; "--" keeps an OUT starting with "-" from being
# parsed as an option.
rm -rf -- "${OUT:?}/lightrag_cache"
"$PYTHON" rag_indexer.py "$OUT" --backend opencode --model nemotron-3-super-free --vlm-desc

# 4. Test query
echo ""
echo "[4/4] Тестовый запрос"
"$PYTHON" rag_query.py "$OUT" "Какие оси есть в чертеже?" --backend opencode --mode hybrid

echo ""
echo "=============================================="
echo " Готово! Индекс в $OUT/lightrag_cache"
echo "=============================================="