- Add rag_indexer.py: build LightRAG index from OCR with OpenCode API - Add rag_query.py: query the knowledge graph - Add vlm_describer.py: generate VLM descriptions via LM Studio - Add test_model.py: quick check for LightRAG-compatible models - Add run_pipeline.sh and run_pipeline.bat: full OCR → VLM → RAG pipeline - Fix rapidocr import (rapidocr_onnxruntime) - Fix process_any_pdf.py paths for cross-platform use - Add .env.example, README_RAG.md, AGENTS.md - Update .gitignore for outputs and secrets
353 lines
14 KiB
Python
353 lines
14 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Построение LightRAG индекса из OCR-результатов чертежей.
|
||
|
||
Использование:
|
||
# С OpenAI (требуется OPENAI_API_KEY)
|
||
python rag_indexer.py <output_folder> --backend openai
|
||
|
||
# С локальной моделью через Ollama (требуется запущенный Ollama)
|
||
python rag_indexer.py <output_folder> --backend ollama --model qwen2.5:14b
|
||
|
||
# Через LM Studio (локальный OpenAI-compatible сервер)
|
||
python rag_indexer.py <output_folder> --backend lmstudio --model qwen2.5:14b
|
||
|
||
# С VLM-описаниями (предварительно запустить vlm_describer.py)
|
||
python rag_indexer.py <output_folder> --backend lmstudio --vlm-desc
|
||
|
||
Результат: папка <output_folder>/lightrag_cache с графом знаний.
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import asyncio
|
||
import argparse
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
|
||
from dotenv import load_dotenv
|
||
load_dotenv()
|
||
|
||
# ------------------------------------------------------------------
|
||
# LightRAG imports
|
||
# ------------------------------------------------------------------
|
||
try:
|
||
from lightrag import LightRAG, QueryParam
|
||
from lightrag.utils import EmbeddingFunc
|
||
except ImportError as e:
|
||
print(f"[ERR] LightRAG не установлен: {e}")
|
||
print(" Установите: pip install lightrag-hku")
|
||
sys.exit(1)
|
||
|
||
# ------------------------------------------------------------------
|
||
# LLM backends
|
||
# ------------------------------------------------------------------
|
||
def get_openai_backend(model: str = "gpt-4o-mini"):
    """Build the LLM and embedding callables for the OpenAI backend.

    The API key is not read here; the OpenAI binding is expected to pick up
    ``OPENAI_API_KEY`` from the environment.

    Args:
        model: Chat-completion model name passed to
            ``openai_complete_if_cache`` on every call.

    Returns:
        A ``(llm_func, embed_cfg)`` tuple: an async completion function and
        an ``EmbeddingFunc`` that embeds with ``text-embedding-3-small``.

    Raises:
        ImportError: If neither LightRAG import layout provides the
            OpenAI bindings.
    """
    try:
        # Package layout: one submodule per provider, embedding helper
        # named ``openai_embed``.
        from lightrag.llm.openai import openai_complete_if_cache, openai_embed
    except ImportError:
        # Flat layout used by the original code.
        from lightrag.llm import openai_complete_if_cache
        from lightrag.llm import openai_embedding as openai_embed

    async def llm_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        # ``None`` instead of a shared mutable ``[]`` default.
        return await openai_complete_if_cache(
            model,
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages or [],
            **kwargs,
        )

    async def embed_func(texts: list[str]):
        # Returns whatever the binding returns; no conversion is applied.
        return await openai_embed(texts, model="text-embedding-3-small")

    embed_cfg = EmbeddingFunc(
        embedding_dim=1536,
        max_token_size=8192,
        func=embed_func,
    )
    return llm_func, embed_cfg
|
||
|
||
|
||
def get_ollama_backend(model: str = "qwen2.5:14b", embed_model: str = "nomic-embed-text"):
    """Build the LLM and embedding callables for a local Ollama server.

    Args:
        model: Name of the Ollama chat model used for completions.
        embed_model: Name of the Ollama embedding model.

    Returns:
        A ``(llm_func, embed_cfg)`` tuple: an async completion function and
        an ``EmbeddingFunc`` wrapping the Ollama embedding call.

    Raises:
        ImportError: If neither LightRAG import layout provides the
            Ollama bindings.
    """
    # NOTE(review): ``ollama_model_complete`` takes the prompt as its first
    # positional argument and reads the model name from LightRAG's own
    # configuration, so calling it as ``(model, prompt, system_prompt=...)``
    # binds ``system_prompt`` twice. The model-first variant is used instead
    # so that ``model`` is honoured. Confirm the names below against the
    # installed LightRAG release.
    try:
        from lightrag.llm.ollama import _ollama_model_if_cache as ollama_complete
        from lightrag.llm.ollama import ollama_embed
    except ImportError:
        # Flat layout used by the original code.
        from lightrag.llm import ollama_model_if_cache as ollama_complete
        from lightrag.llm import ollama_embedding as ollama_embed

    async def llm_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        # Same translation the library's own wrapper performs: a keyword
        # extraction request becomes a JSON-formatted response request.
        if kwargs.pop("keyword_extraction", None):
            kwargs["format"] = "json"
        return await ollama_complete(
            model,
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages or [],
            **kwargs,
        )

    async def embed_func(texts: list[str]):
        # Positional on purpose: the binding's parameter is ``embed_model``,
        # not ``model``.
        return await ollama_embed(texts, embed_model)

    # 768 matches nomic-embed-text; adjust when using another embed model.
    embed_cfg = EmbeddingFunc(
        embedding_dim=768,
        max_token_size=8192,
        func=embed_func,
    )
    return llm_func, embed_cfg
|
||
|
||
|
||
def get_lmstudio_backend(model: str = "qwen2.5:14b"):
    """Build the LLM and embedding callables for an LM Studio server.

    Completions go through the OpenAI-compatible endpoint given by the
    ``LMSTUDIO_URL`` environment variable (default
    ``http://127.0.0.1:1234/v1``), authenticated with ``LMSTUDIO_API_KEY``
    (default ``"lm-studio"``). Embeddings are computed locally with a
    sentence-transformers model.

    Args:
        model: Chat model name sent to the LM Studio endpoint.

    Returns:
        A ``(llm_func, embed_cfg)`` tuple: an async completion function and
        an ``EmbeddingFunc`` backed by the local embedding model.

    Raises:
        ImportError: If the LightRAG OpenAI binding or
            ``sentence_transformers`` cannot be imported.
    """
    try:
        from lightrag.llm.openai import openai_complete_if_cache
    except ImportError:
        # Flat layout used by the original code.
        from lightrag.llm import openai_complete_if_cache
    from sentence_transformers import SentenceTransformer

    base_url = os.environ.get("LMSTUDIO_URL", "http://127.0.0.1:1234/v1")
    api_key = os.environ.get("LMSTUDIO_API_KEY", "lm-studio")

    async def llm_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        # ``None`` instead of a shared mutable ``[]`` default.
        return await openai_complete_if_cache(
            model,
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages or [],
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    # Embeddings come from a separate local model because the chat model
    # served by LM Studio (the original note names qwen3-vl-4b) does not
    # produce embeddings.
    embed_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

    async def embed_func(texts: list[str]):
        # Return the NumPy array as produced by ``encode`` -- the same shape
        # of result the opencode backend in this file hands to LightRAG.
        # NOTE: ``encode`` is synchronous and blocks the event loop while
        # it runs.
        return embed_model.encode(texts, convert_to_numpy=True)

    embed_cfg = EmbeddingFunc(
        embedding_dim=384,
        max_token_size=512,
        func=embed_func,
    )
    return llm_func, embed_cfg
|
||
|
||
|
||
def get_opencode_backend(model: str = "opencode/deepseek-v4-flash-free"):
    """Build the LLM and embedding callables for the OpenCode API.

    Completions are sent with ``openai.AsyncOpenAI`` to the URL in the
    ``OPENCODE_URL`` environment variable (default
    ``https://opencode.ai/zen/v1``) using ``OPENCODE_API_KEY`` (default:
    empty string). Embeddings are computed locally with a
    sentence-transformers model.

    Args:
        model: Chat model name sent with every completion request.

    Returns:
        A ``(llm_func, embed_cfg)`` tuple: an async completion function and
        an ``EmbeddingFunc`` backed by the local embedding model.

    Raises:
        ImportError: If ``openai`` or ``sentence_transformers`` is missing.
    """
    from openai import AsyncOpenAI
    from sentence_transformers import SentenceTransformer

    base_url = os.environ.get("OPENCODE_URL", "https://opencode.ai/zen/v1")
    api_key = os.environ.get("OPENCODE_API_KEY", "")
    client = AsyncOpenAI(base_url=base_url, api_key=api_key)

    async def llm_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        # Message order: optional system prompt, prior history, then the
        # current user prompt.
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.extend(history_messages or [])
        messages.append({"role": "user", "content": prompt})

        # Only ``temperature`` and ``max_tokens`` are honoured; every other
        # keyword argument passed by the caller is ignored.
        response = await client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=kwargs.get("temperature", 0.3),
            max_tokens=kwargs.get("max_tokens", 1024),
        )
        # ``content`` is optional in the chat-completions response; always
        # hand a string back to the caller.
        return response.choices[0].message.content or ""

    # Embeddings are produced by a local sentence-transformers model.
    embed_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

    async def embed_func(texts: list[str]):
        # Returns the NumPy array produced by ``encode`` (unchanged
        # behaviour; the previous ``list[list[float]]`` annotation did not
        # match it). NOTE: ``encode`` is synchronous and blocks the event
        # loop while it runs.
        return embed_model.encode(texts, convert_to_numpy=True)

    embed_cfg = EmbeddingFunc(
        embedding_dim=384,
        max_token_size=512,
        func=embed_func,
    )
    return llm_func, embed_cfg
|
||
|
||
|
||
# ------------------------------------------------------------------
|
||
# Подготовка текстов из OCR
|
||
# ------------------------------------------------------------------
|
||
def format_page_document(page: dict, page_idx: int, total: int, vlm_descs: Optional[dict] = None) -> str:
    """Render one page record as a plain-text document for RAG indexing.

    Sections are emitted in this order, each followed by a blank line and
    each skipped when its data is absent: header (page number and image
    name), VLM description, PDF text layer (first 2000 characters), OCR
    lines, and entities extracted from the OCR lines.

    Args:
        page: Page record. ``page_number`` and ``image`` are required;
            ``pdf_text_layer`` and ``ocr_lines`` are optional and may be
            ``None``.
        page_idx: Zero-based page position. Currently unused; kept for
            interface compatibility.
        total: Total number of pages, shown in the header.
        vlm_descs: Optional mapping from image name to VLM description.

    Returns:
        The document text, sections joined with newlines.

    Raises:
        KeyError: If ``page`` lacks ``page_number`` or ``image``.
    """
    image = page["image"]
    lines = [
        f"=== Чертеж: страница {page['page_number']} из {total} ===",
        f"Изображение: {image}",
        "",
    ]

    # VLM description of the image, when one is available.
    if vlm_descs and image in vlm_descs:
        lines += ["--- Описание модели зрения (VLM) ---", vlm_descs[image], ""]

    # PDF text layer; ``or ""`` tolerates an explicit JSON null.
    pdf_text = (page.get("pdf_text_layer") or "").strip()
    if pdf_text:
        # Truncated so a huge text layer does not dominate the document.
        lines += ["--- Текстовый слой PDF ---", pdf_text[:2000], ""]

    ocr_lines = page.get("ocr_lines") or []
    if ocr_lines:
        lines.append("--- Распознанный текст (OCR) ---")
        for entry in ocr_lines:
            txt = (entry.get("text") or "").strip()
            conf = entry.get("confidence") or 0.0
            bbox = entry.get("bbox", [])
            lines.append(f' "{txt}" (confidence={conf:.2f}, bbox={bbox})')
        lines.append("")

        # List the entities explicitly; this helps the LLM build the graph.
        # Without OCR lines there is nothing to extract, so this step only
        # runs inside this branch.
        entities = extract_entities_from_page(ocr_lines)
        if any(entities.values()):
            lines.append("--- Извлеченные сущности ---")
            for key, label in (
                ("floors", "Этажи"),
                ("axes", "Оси"),
                ("dimensions", "Размеры"),
                ("rooms", "Помещения"),
            ):
                if entities[key]:
                    lines.append(f"{label}: {', '.join(entities[key])}")
            lines.append("")

    return "\n".join(lines)
|
||
|
||
|
||
def extract_entities_from_page(ocr_lines: list[dict]) -> dict:
    """Extract floor, axis, dimension and room mentions from OCR lines.

    Matching is purely regex-based:

    * floors -- ``<number>[-]й этаж`` (case-insensitive), full match kept;
    * axes -- a standalone uppercase Cyrillic letter, or a standalone
      1-2 digit number between 1 and 50;
    * dimensions -- a standalone 3-5 digit number between 100 and 20000,
      optionally followed by ``м``/``мм``; reported as ``"<number> мм"``;
    * rooms -- ``квартира``, ``кв. <n>`` or ``помещение <n>``
      (case-insensitive), full match kept.

    Args:
        ocr_lines: OCR entries; only the ``text`` key is read. Entries with
            a missing or empty ``text`` contribute nothing.

    Returns:
        A dict with keys ``floors``, ``axes``, ``dimensions`` and ``rooms``,
        each a de-duplicated list. Numeric values are ordered by numeric
        value (so "2" precedes "10"); axis letters follow axis numbers.
    """
    import re

    floor_re = re.compile(r"(\d+)[-\s]?й\s*этаж", re.I)
    axis_re = re.compile(r"\b([А-Я])\b")
    num_axis_re = re.compile(r"\b(\d{1,2})\b")
    # ``м{0,2}`` accepts no unit, "м" or "мм".
    dim_re = re.compile(r"\b(\d{3,5})\s*м{0,2}\b")
    room_re = re.compile(r"(квартира|кв\.?\s*\d+|помещение\s*\d+)", re.I)

    floors, axes, dims, rooms = set(), set(), set(), set()

    for entry in ocr_lines or ():
        text = entry.get("text") or ""
        floors.update(m.group(0) for m in floor_re.finditer(text))
        axes.update(axis_re.findall(text))
        axes.update(v for v in num_axis_re.findall(text) if 1 <= int(v) <= 50)
        dims.update(v for v in dim_re.findall(text) if 100 <= int(v) <= 20000)
        rooms.update(m.group(0) for m in room_re.finditer(text))

    def floor_key(label):
        # Every floor match starts with its number.
        return int(re.match(r"\d+", label).group()), label

    def axis_key(label):
        # Numeric axes first in numeric order, then letter axes.
        return (0, int(label), label) if label.isdigit() else (1, 0, label)

    return {
        "floors": sorted(floors, key=floor_key),
        "axes": sorted(axes, key=axis_key),
        "dimensions": [f"{v} мм" for v in sorted(dims, key=lambda v: (int(v), v))],
        "rooms": sorted(rooms),
    }
|
||
|
||
|
||
def build_documents(ocr_path: Path, vlm_desc_path: Optional[Path] = None) -> list[str]:
    """Build one text document per page from the OCR results file.

    Args:
        ocr_path: Path to the OCR results JSON; its top-level ``pages`` key
            must hold the list of page records.
        vlm_desc_path: Optional path to a JSON file with VLM descriptions.
            When given but missing on disk, a warning is printed and
            indexing proceeds without descriptions.

    Returns:
        The formatted documents, in page order.

    Raises:
        KeyError: If the OCR JSON has no ``pages`` key.
    """
    data = json.loads(ocr_path.read_text(encoding="utf-8"))
    pages = data["pages"]

    vlm_descs = {}
    if vlm_desc_path is not None:
        if vlm_desc_path.exists():
            vlm_descs = json.loads(vlm_desc_path.read_text(encoding="utf-8"))
            print(f" VLM-описания загружены: {len(vlm_descs)} шт.")
        else:
            # Previously ignored silently, which made a forgotten VLM step
            # indistinguishable from a successful one.
            print(f"[WARN] Файл VLM-описаний не найден: {vlm_desc_path}")

    total = len(pages)
    return [
        format_page_document(page, idx, total, vlm_descs)
        for idx, page in enumerate(pages)
    ]
|
||
|
||
|
||
# ------------------------------------------------------------------
|
||
# Основной pipeline
|
||
# ------------------------------------------------------------------
|
||
async def index_folder(folder: Path, backend: str, model: str, embed_model: Optional[str], use_vlm: bool = False):
    """Index the OCR results found in ``folder`` into a LightRAG store.

    Steps: load ``full_ocr_results.json``, build one document per page,
    create the selected backend, initialize LightRAG storages under
    ``<folder>/lightrag_cache`` and insert the documents one by one.

    Args:
        folder: Directory containing ``full_ocr_results.json``.
        backend: One of ``openai``, ``ollama``, ``lmstudio``, ``opencode``.
        model: LLM model name handed to the backend factory.
        embed_model: Embedding model name; only used by the Ollama backend
            (falls back to ``nomic-embed-text``).
        use_vlm: When true, ``<folder>/vlm_descriptions.json`` is passed to
            the document builder.

    Raises:
        SystemExit: With code 1 when the OCR file is missing or the backend
            name is unknown.
    """
    ocr_path = folder / "full_ocr_results.json"
    if not ocr_path.exists():
        print(f"[ERR] Не найден {ocr_path}")
        sys.exit(1)

    cache_dir = folder / "lightrag_cache"
    cache_dir.mkdir(exist_ok=True)

    vlm_desc_path = folder / "vlm_descriptions.json" if use_vlm else None

    print(f"[1/4] Загрузка OCR-данных из {ocr_path}")
    docs = build_documents(ocr_path, vlm_desc_path)
    print(f" Страниц: {len(docs)}")

    print(f"[2/4] Инициализация LightRAG (backend={backend}, model={model})")
    if backend == "openai":
        llm_func, embed_cfg = get_openai_backend(model)
    elif backend == "ollama":
        llm_func, embed_cfg = get_ollama_backend(model, embed_model or "nomic-embed-text")
    elif backend == "lmstudio":
        llm_func, embed_cfg = get_lmstudio_backend(model)
    elif backend == "opencode":
        llm_func, embed_cfg = get_opencode_backend(model)
    else:
        print(f"[ERR] Неизвестный backend: {backend}")
        sys.exit(1)

    rag = LightRAG(
        working_dir=str(cache_dir),
        llm_model_func=llm_func,
        embedding_func=embed_cfg,
    )

    print("[2.5/4] Инициализация хранилищ...")
    # NOTE(review): some LightRAG releases also require
    # ``initialize_pipeline_status()`` after this call -- confirm against
    # the installed version.
    await rag.initialize_storages()

    try:
        print("[3/4] Вставка документов в индекс...")
        for i, doc_text in enumerate(docs, 1):
            print(f" [{i}/{len(docs)}] Страница {i}")
            await rag.ainsert(doc_text)
    finally:
        # Release the storages even when an insert fails. Looked up
        # dynamically so a release without this method is still supported.
        finalize = getattr(rag, "finalize_storages", None)
        if finalize is not None:
            await finalize()

    print(f"[4/4] Готово. Индекс сохранен в {cache_dir}")
    print(f" Для запросов используйте: python rag_query.py \"{folder}\"")
|
||
|
||
|
||
def main():
    """Parse command-line arguments and run the indexing pipeline.

    When ``--model`` is omitted, the model defaults to a per-backend value.
    An explicitly supplied ``--model`` is always used as given.
    """
    # Per-backend default models; also the source of the ``--backend``
    # choices and of the ``--model`` help text, so the three cannot drift.
    # NOTE(review): the opencode default differs from the default in
    # ``get_opencode_backend``'s signature -- confirm which one is intended.
    default_models = {
        "openai": "gpt-4o-mini",
        "ollama": "qwen2.5:14b",
        "lmstudio": "qwen2.5:14b",
        "opencode": "mimo-v2.5-free",
    }
    defaults_help = ", ".join(f"{name}: {value}" for name, value in default_models.items())

    parser = argparse.ArgumentParser(description="Построение LightRAG индекса из OCR")
    parser.add_argument("folder", help="Папка с full_ocr_results.json")
    parser.add_argument("--backend", choices=list(default_models), default="openai",
                        help="LLM backend (default: openai)")
    # ``None`` marks "not provided"; comparing against a real model name
    # would silently override an explicit ``--model gpt-4o-mini``.
    parser.add_argument("--model", default=None,
                        help=f"Название модели LLM (default зависит от backend — {defaults_help})")
    parser.add_argument("--embed-model", default=None,
                        help="Модель эмбеддингов (только для Ollama, default: nomic-embed-text)")
    parser.add_argument("--vlm-desc", action="store_true",
                        help="Использовать VLM-описания из vlm_descriptions.json")
    args = parser.parse_args()

    folder = Path(args.folder)
    model = args.model if args.model is not None else default_models[args.backend]

    asyncio.run(index_folder(folder, args.backend, model, args.embed_model, args.vlm_desc))


if __name__ == "__main__":
    main()
|