opencode/rag_indexer.py
Кирилл Блинов c756a5766b Add RAG pipeline: LightRAG indexer, OpenCode API, VLM describer, and test tools
- Add rag_indexer.py: build LightRAG index from OCR with OpenCode API
- Add rag_query.py: query the knowledge graph
- Add vlm_describer.py: generate VLM descriptions via LM Studio
- Add test_model.py: quick check for LightRAG-compatible models
- Add run_pipeline.sh and run_pipeline.bat: full OCR → VLM → RAG pipeline
- Fix rapidocr import (rapidocr_onnxruntime)
- Fix process_any_pdf.py paths for cross-platform use
- Add .env.example, README_RAG.md, AGENTS.md
- Update .gitignore for outputs and secrets
2026-05-29 09:54:37 +03:00

353 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build a LightRAG index from the OCR results of drawings.

Usage:
    # With OpenAI (requires OPENAI_API_KEY)
    python rag_indexer.py <output_folder> --backend openai

    # With a local model via Ollama (requires a running Ollama server)
    python rag_indexer.py <output_folder> --backend ollama --model qwen2.5:14b

    # Via LM Studio (local OpenAI-compatible server)
    python rag_indexer.py <output_folder> --backend lmstudio --model qwen2.5:14b

    # Via the OpenCode API (reads OPENCODE_API_KEY and, optionally, OPENCODE_URL)
    python rag_indexer.py <output_folder> --backend opencode

    # With VLM descriptions (run vlm_describer.py beforehand)
    python rag_indexer.py <output_folder> --backend lmstudio --vlm-desc

Result: the <output_folder>/lightrag_cache folder containing the knowledge graph.
"""
import os
import sys
import json
import asyncio
import argparse
from pathlib import Path
from typing import Optional
from dotenv import load_dotenv
load_dotenv()
# ------------------------------------------------------------------
# LightRAG imports
# ------------------------------------------------------------------
try:
from lightrag import LightRAG, QueryParam
from lightrag.utils import EmbeddingFunc
except ImportError as e:
print(f"[ERR] LightRAG не установлен: {e}")
print(" Установите: pip install lightrag-hku")
sys.exit(1)
# ------------------------------------------------------------------
# LLM backends
# ------------------------------------------------------------------
def get_openai_backend(model: str = "gpt-4o-mini"):
    """Build the LLM and embedding callables for the OpenAI backend.

    The original note says this backend requires ``OPENAI_API_KEY``; the key
    is not read here, so it is presumably consumed by the LightRAG helpers.

    Args:
        model: Chat model name forwarded to ``openai_complete_if_cache``.

    Returns:
        A ``(llm_func, embed_cfg)`` tuple: an async completion callable and an
        ``EmbeddingFunc`` configured for ``text-embedding-3-small``
        (1536 dimensions, 8192-token limit).

    Raises:
        ImportError: If the OpenAI helpers cannot be imported from LightRAG.
    """
    # NOTE(review): the LightRAG README imports these helpers from
    # ``lightrag.llm.openai`` (with the embedder named ``openai_embed``).
    # That layout is tried first; the flat ``lightrag.llm`` names this file
    # was written against remain as the fallback. Confirm against the
    # installed lightrag-hku version.
    try:
        from lightrag.llm.openai import openai_complete_if_cache, openai_embed
    except ImportError:
        from lightrag.llm import openai_complete_if_cache
        from lightrag.llm import openai_embedding as openai_embed

    async def llm_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        # ``None`` replaces a shared mutable ``[]`` default; the helper still
        # receives a list.
        return await openai_complete_if_cache(
            model,
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages or [],
            **kwargs,
        )

    async def embed_func(texts: list[str]):
        # Returns whatever the LightRAG helper returns for this batch.
        return await openai_embed(texts, model="text-embedding-3-small")

    embed_cfg = EmbeddingFunc(
        embedding_dim=1536,
        max_token_size=8192,
        func=embed_func,
    )
    return llm_func, embed_cfg
def get_ollama_backend(model: str = "qwen2.5:14b", embed_model: str = "nomic-embed-text"):
    """Build the LLM and embedding callables for the Ollama backend.

    The original note says this backend requires a running ``ollama serve``.

    Args:
        model: Chat model name passed to the Ollama completion helper.
        embed_model: Embedding model name passed to the Ollama embedding helper.

    Returns:
        A ``(llm_func, embed_cfg)`` tuple: an async completion callable and an
        ``EmbeddingFunc`` declared as 768-dimensional with an 8192-token limit.

    Raises:
        ImportError: If the Ollama helpers cannot be imported from LightRAG.
    """
    # NOTE(review): the LightRAG README imports these helpers from
    # ``lightrag.llm.ollama`` (with the embedder named ``ollama_embed``).
    # That layout is tried first; the flat ``lightrag.llm`` names this file
    # was written against remain as the fallback. Confirm against the
    # installed lightrag-hku version.
    try:
        from lightrag.llm.ollama import ollama_model_complete, ollama_embed
    except ImportError:
        from lightrag.llm import ollama_model_complete
        from lightrag.llm import ollama_embedding as ollama_embed

    async def llm_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        # NOTE(review): the call shape (model first, then prompt) is kept from
        # the original. In the LightRAG releases I am aware of,
        # ``ollama_model_complete`` takes the prompt first and reads the model
        # name from the LightRAG config (``llm_model_name``) — verify this
        # call against the installed version before relying on this backend.
        return await ollama_model_complete(
            model,
            prompt,
            system_prompt=system_prompt,
            # ``None`` replaces a shared mutable ``[]`` default.
            history_messages=history_messages or [],
            **kwargs,
        )

    async def embed_func(texts: list[str]):
        # Passed positionally: README examples name this parameter
        # ``embed_model``, so the previous ``model=`` keyword would not bind
        # to it. Positional passing works with either parameter name.
        return await ollama_embed(texts, embed_model)

    # 768 matches nomic-embed-text; adjust when using a different embed model.
    embed_cfg = EmbeddingFunc(
        embedding_dim=768,
        max_token_size=8192,
        func=embed_func,
    )
    return llm_func, embed_cfg
def get_lmstudio_backend(model: str = "qwen2.5:14b"):
    """Build the LLM and embedding callables for an LM Studio server.

    LM Studio is addressed as an OpenAI-compatible API. The base URL comes
    from ``LMSTUDIO_URL`` (default ``http://127.0.0.1:1234/v1``) and the key
    from ``LMSTUDIO_API_KEY`` (default ``"lm-studio"``).

    Args:
        model: Chat model name forwarded to ``openai_complete_if_cache``.

    Returns:
        A ``(llm_func, embed_cfg)`` tuple: an async completion callable and an
        ``EmbeddingFunc`` declared as 384-dimensional with a 512-token limit,
        backed by a local sentence-transformers model.

    Raises:
        ImportError: If LightRAG's OpenAI helper or sentence-transformers
            cannot be imported.
    """
    import numpy as np

    # NOTE(review): the LightRAG README imports this helper from
    # ``lightrag.llm.openai``; the flat ``lightrag.llm`` path the file was
    # written against is kept as the fallback. Confirm against the installed
    # lightrag-hku version.
    try:
        from lightrag.llm.openai import openai_complete_if_cache
    except ImportError:
        from lightrag.llm import openai_complete_if_cache
    from sentence_transformers import SentenceTransformer

    base_url = os.environ.get("LMSTUDIO_URL", "http://127.0.0.1:1234/v1")
    api_key = os.environ.get("LMSTUDIO_API_KEY", "lm-studio")

    async def llm_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        return await openai_complete_if_cache(
            model,
            prompt,
            system_prompt=system_prompt,
            # ``None`` replaces a shared mutable ``[]`` default.
            history_messages=history_messages or [],
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    # Embeddings come from a local sentence-transformers model: the original
    # author notes that qwen3-vl-4b cannot produce embeddings, so a separate
    # model is needed.
    embed_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

    async def embed_func(texts: list[str]) -> np.ndarray:
        # Return the ndarray itself rather than ``.tolist()``: this matches
        # the opencode backend in this file and the custom embedding examples
        # in the LightRAG README, which return ``np.ndarray``.
        # NOTE(review): ``encode`` is a blocking call inside a coroutine.
        return embed_model.encode(texts, convert_to_numpy=True)

    embed_cfg = EmbeddingFunc(
        embedding_dim=384,
        max_token_size=512,
        func=embed_func,
    )
    return llm_func, embed_cfg
def get_opencode_backend(model: str = "opencode/deepseek-v4-flash-free"):
    """Build the LLM and embedding callables for the OpenCode API.

    The service is called through ``openai.AsyncOpenAI`` as an
    OpenAI-compatible endpoint. The base URL comes from ``OPENCODE_URL``
    (default ``https://opencode.ai/zen/v1``) and the key from
    ``OPENCODE_API_KEY`` (default: empty string).

    Args:
        model: Model identifier sent with every chat completion request.

    Returns:
        A ``(llm_func, embed_cfg)`` tuple: an async completion callable and an
        ``EmbeddingFunc`` declared as 384-dimensional with a 512-token limit,
        backed by a local sentence-transformers model.

    Raises:
        ImportError: If ``openai`` or ``sentence_transformers`` is missing.
    """
    import numpy as np
    from openai import AsyncOpenAI
    from sentence_transformers import SentenceTransformer

    base_url = os.environ.get("OPENCODE_URL", "https://opencode.ai/zen/v1")
    api_key = os.environ.get("OPENCODE_API_KEY", "")
    client = AsyncOpenAI(base_url=base_url, api_key=api_key)

    async def llm_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        # Assemble the chat transcript: optional system prompt, prior turns,
        # then the current user prompt.
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        if history_messages:
            messages.extend(history_messages)
        messages.append({"role": "user", "content": prompt})

        # Only temperature and max_tokens are taken from kwargs; every other
        # keyword supplied by the caller is deliberately not forwarded.
        response = await client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=kwargs.get("temperature", 0.3),
            max_tokens=kwargs.get("max_tokens", 1024),
        )
        # The SDK types ``content`` as optional; hand callers a string always.
        return response.choices[0].message.content or ""

    # Embeddings are computed locally with sentence-transformers.
    embed_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

    async def embed_func(texts: list[str]) -> np.ndarray:
        # ``convert_to_numpy=True`` yields an ndarray, returned unchanged.
        # NOTE(review): ``encode`` is a blocking call inside a coroutine.
        return embed_model.encode(texts, convert_to_numpy=True)

    embed_cfg = EmbeddingFunc(
        embedding_dim=384,
        max_token_size=512,
        func=embed_func,
    )
    return llm_func, embed_cfg
# ------------------------------------------------------------------
# Подготовка текстов из OCR
# ------------------------------------------------------------------
def format_page_document(page: dict, page_idx: int, total: int, vlm_descs: dict = None) -> str:
    """Render one page record as a plain-text document for RAG insertion.

    Args:
        page: Page record. ``page_number`` and ``image`` are required;
            ``pdf_text_layer`` and ``ocr_lines`` are optional. Each OCR line
            needs ``text`` and may carry ``confidence`` and ``bbox``.
        page_idx: Position of the page in the source list (not used in the
            rendered text).
        total: Total number of pages, shown in the header line.
        vlm_descs: Optional mapping from image name to a VLM description.

    Returns:
        The document sections joined with newlines.

    Raises:
        KeyError: If ``page_number``, ``image`` or an OCR line's ``text`` is
            missing.
    """

    def render_ocr_entry(item: dict) -> str:
        text = item["text"].strip()
        score = item.get("confidence", 0.0)
        box = item.get("bbox", [])
        return f' "{text}" (confidence={score:.2f}, bbox={box})'

    out = [
        f"=== Чертеж: страница {page['page_number']} из {total} ===",
        f"Изображение: {page['image']}",
        "",
    ]

    # VLM description of the image, when one was supplied for it.
    image_name = page["image"]
    if vlm_descs and image_name in vlm_descs:
        out += ["--- Описание модели зрения (VLM) ---", vlm_descs[image_name], ""]

    # Embedded PDF text layer, capped at 2000 characters to keep the
    # document from being overloaded.
    text_layer = page.get("pdf_text_layer", "").strip()
    if text_layer:
        out += ["--- Текстовый слой PDF ---", text_layer[:2000], ""]

    recognized = page.get("ocr_lines", [])
    if recognized:
        out.append("--- Распознанный текст (OCR) ---")
        out.extend(render_ocr_entry(item) for item in recognized)
        out.append("")

        # Listing the extracted entities explicitly helps the LLM build the graph.
        found = extract_entities_from_page(recognized)
        if any(found.values()):
            out.append("--- Извлеченные сущности ---")
            sections = (
                ("floors", "Этажи"),
                ("axes", "Оси"),
                ("dimensions", "Размеры"),
                ("rooms", "Помещения"),
            )
            for key, label in sections:
                if found[key]:
                    out.append(f"{label}: {', '.join(found[key])}")
            out.append("")

    return "\n".join(out)
def extract_entities_from_page(ocr_lines: list[dict]) -> dict:
    """Extract floor, axis, dimension and room mentions from a page's OCR lines.

    Matching is regex-based and heuristic:

    * floors: ``<number>[-]й этаж`` (case-insensitive), kept as matched text;
    * axes: a standalone uppercase Cyrillic letter ``А``–``Я``, or a
      standalone 1–2 digit number in the range 1..50;
    * dimensions: a standalone 3–5 digit number in the range 100..20000,
      reported as ``"<number> мм"``;
    * rooms: ``квартира``, ``кв. <n>`` or ``помещение <n>`` (case-insensitive).

    Args:
        ocr_lines: OCR line records; only the ``text`` field is read. Records
            with a missing or ``None`` text are skipped.

    Returns:
        Dict with the keys ``floors``, ``axes``, ``dimensions`` and ``rooms``,
        each a de-duplicated list in natural order (numbers compare by value,
        so ``"2"`` precedes ``"10"`` and ``"200 мм"`` precedes ``"1000 мм"``).
    """
    import re

    floor_re = re.compile(r"(\d+)[-\s]?й\s*этаж", re.I)
    axis_re = re.compile(r"\b([А-Я])\b")
    num_axis_re = re.compile(r"\b(\d{1,2})\b")
    dim_re = re.compile(r"\b(\d{3,5})\s*м?[мм]?\b")
    room_re = re.compile(r"([Кк]вартира|[Кк]в\.?\s*\d+|[Пп]омещение\s*\d+)", re.I)
    digit_run_re = re.compile(r"(\d+)")

    def natural_key(value: str):
        # Split into digit / non-digit chunks so embedded numbers compare by
        # value. The leading flag keeps int chunks and str chunks from ever
        # being compared with each other; the raw string breaks ties
        # (e.g. "05" vs "5") so the order stays deterministic.
        chunks = digit_run_re.split(value)
        parts = [(0, int(c), "") if c.isdecimal() else (1, 0, c) for c in chunks]
        return parts, value

    floors, axes, dims, rooms = set(), set(), set(), set()
    for entry in ocr_lines or ():
        text = entry.get("text") or ""
        floors.update(m.group(0) for m in floor_re.finditer(text))
        axes.update(m.group(1) for m in axis_re.finditer(text))
        axes.update(
            m.group(1)
            for m in num_axis_re.finditer(text)
            if 1 <= int(m.group(1)) <= 50
        )
        dims.update(
            f"{m.group(1)} мм"
            for m in dim_re.finditer(text)
            if 100 <= int(m.group(1)) <= 20000
        )
        rooms.update(m.group(0) for m in room_re.finditer(text))

    return {
        "floors": sorted(floors, key=natural_key),
        "axes": sorted(axes, key=natural_key),
        "dimensions": sorted(dims, key=natural_key),
        "rooms": sorted(rooms, key=natural_key),
    }
def build_documents(ocr_path: Path, vlm_desc_path: Optional[Path] = None) -> list[str]:
    """Assemble one text document per page from the OCR results file.

    Args:
        ocr_path: UTF-8 JSON file with a top-level ``pages`` list.
        vlm_desc_path: Optional UTF-8 JSON file with VLM descriptions; it is
            loaded only when a path is given and the file exists.

    Returns:
        The rendered page documents, in the order of ``pages``.

    Raises:
        KeyError: If the OCR file has no ``pages`` key.
    """
    payload = json.loads(ocr_path.read_text(encoding="utf-8"))
    page_entries = payload["pages"]

    descriptions = {}
    if vlm_desc_path and vlm_desc_path.exists():
        descriptions = json.loads(vlm_desc_path.read_text(encoding="utf-8"))
        print(f" VLM-описания загружены: {len(descriptions)} шт.")

    page_count = len(page_entries)
    return [
        format_page_document(entry, position, page_count, descriptions)
        for position, entry in enumerate(page_entries)
    ]
# ------------------------------------------------------------------
# Основной pipeline
# ------------------------------------------------------------------
async def index_folder(folder: Path, backend: str, model: str, embed_model: Optional[str], use_vlm: bool = False):
    """Index the OCR output of ``folder`` into a LightRAG working directory.

    Reads ``<folder>/full_ocr_results.json``, builds one document per page and
    inserts the documents one at a time into a LightRAG instance whose
    working directory is ``<folder>/lightrag_cache`` (created if absent).
    Progress is printed to stdout.

    Args:
        folder: Directory containing ``full_ocr_results.json``.
        backend: One of ``openai``, ``ollama``, ``lmstudio``, ``opencode``.
        model: LLM model name handed to the backend factory.
        embed_model: Embedding model for the Ollama backend; a falsy value
            selects ``nomic-embed-text``. Other backends ignore it.
        use_vlm: When true, ``<folder>/vlm_descriptions.json`` is passed on
            as the VLM description source.

    Raises:
        SystemExit: With status 1 when the OCR file is missing or the backend
            name is unknown.
    """
    ocr_path = folder / "full_ocr_results.json"
    if not ocr_path.exists():
        print(f"[ERR] Не найден {ocr_path}")
        sys.exit(1)

    cache_dir = folder / "lightrag_cache"
    cache_dir.mkdir(exist_ok=True)

    vlm_desc_path = None
    if use_vlm:
        vlm_desc_path = folder / "vlm_descriptions.json"

    print(f"[1/4] Загрузка OCR-данных из {ocr_path}")
    docs = build_documents(ocr_path, vlm_desc_path)
    print(f" Страниц: {len(docs)}")

    print(f"[2/4] Инициализация LightRAG (backend={backend}, model={model})")
    # Factories are wrapped in lambdas so only the selected backend is built.
    factories = {
        "openai": lambda: get_openai_backend(model),
        "ollama": lambda: get_ollama_backend(model, embed_model or "nomic-embed-text"),
        "lmstudio": lambda: get_lmstudio_backend(model),
        "opencode": lambda: get_opencode_backend(model),
    }
    make_backend = factories.get(backend)
    if make_backend is None:
        print(f"[ERR] Неизвестный backend: {backend}")
        sys.exit(1)
    llm_func, embed_cfg = make_backend()

    rag = LightRAG(
        working_dir=str(cache_dir),
        llm_model_func=llm_func,
        embedding_func=embed_cfg,
    )

    print("[2.5/4] Инициализация хранилищ...")
    await rag.initialize_storages()

    print("[3/4] Вставка документов в индекс...")
    doc_count = len(docs)
    for number, document in enumerate(docs, 1):
        print(f" [{number}/{doc_count}] Страница {number}")
        await rag.ainsert(document)
    # NOTE(review): storages are initialised but never explicitly finalised —
    # confirm whether the installed LightRAG version needs a finalize call.

    print(f"[4/4] Готово. Индекс сохранен в {cache_dir}")
    print(f" Для запросов используйте: python rag_query.py \"{folder}\"")
def main():
    """Parse the command line and run the indexing pipeline.

    When ``--model`` is left at its ``gpt-4o-mini`` default, a backend-specific
    model is substituted for the ``ollama``, ``lmstudio`` and ``opencode``
    backends before ``index_folder`` is started.
    """
    cli = argparse.ArgumentParser(description="Построение LightRAG индекса из OCR")
    cli.add_argument("folder", help="Папка с full_ocr_results.json")
    cli.add_argument(
        "--backend",
        choices=["openai", "ollama", "lmstudio", "opencode"],
        default="openai",
        help="LLM backend (default: openai)",
    )
    cli.add_argument(
        "--model",
        default="gpt-4o-mini",
        help="Название модели LLM (default: gpt-4o-mini, opencode: deepseek-v4-flash-free)",
    )
    cli.add_argument(
        "--embed-model",
        default=None,
        help="Модель эмбеддингов (только для Ollama, default: nomic-embed-text)",
    )
    cli.add_argument(
        "--vlm-desc",
        action="store_true",
        help="Использовать VLM-описания из vlm_descriptions.json",
    )
    options = cli.parse_args()

    # The argparse default doubles as the "model not specified" marker, so an
    # explicit ``--model gpt-4o-mini`` is replaced for these backends as well.
    # NOTE(review): the opencode substitute here (mimo-v2.5-free) differs from
    # both the --model help text and get_opencode_backend's own default —
    # confirm which model is intended.
    substitutes = {
        "ollama": "qwen2.5:14b",
        "lmstudio": "qwen2.5:14b",
        "opencode": "mimo-v2.5-free",
    }
    chosen_model = options.model
    if chosen_model == "gpt-4o-mini":
        chosen_model = substitutes.get(options.backend, chosen_model)

    asyncio.run(
        index_folder(
            Path(options.folder),
            options.backend,
            chosen_model,
            options.embed_model,
            options.vlm_desc,
        )
    )


if __name__ == "__main__":
    main()