353 lines
14 KiB
Python
353 lines
14 KiB
Python
|
|
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build a LightRAG index from the OCR results of drawings.

Usage:
    # With OpenAI (requires OPENAI_API_KEY)
    python rag_indexer.py <output_folder> --backend openai

    # With a local model through Ollama (requires a running Ollama server)
    python rag_indexer.py <output_folder> --backend ollama --model qwen2.5:14b

    # Through LM Studio (local OpenAI-compatible server)
    python rag_indexer.py <output_folder> --backend lmstudio --model qwen2.5:14b

    # With VLM descriptions (run vlm_describer.py beforehand)
    python rag_indexer.py <output_folder> --backend lmstudio --vlm-desc

Result: the <output_folder>/lightrag_cache folder containing the knowledge graph.
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import json
|
|||
|
|
import asyncio
|
|||
|
|
import argparse
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Optional
|
|||
|
|
|
|||
|
|
from dotenv import load_dotenv
|
|||
|
|
load_dotenv()
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# LightRAG imports
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
try:
|
|||
|
|
from lightrag import LightRAG, QueryParam
|
|||
|
|
from lightrag.utils import EmbeddingFunc
|
|||
|
|
except ImportError as e:
|
|||
|
|
print(f"[ERR] LightRAG не установлен: {e}")
|
|||
|
|
print(" Установите: pip install lightrag-hku")
|
|||
|
|
sys.exit(1)
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# LLM backends
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
def get_openai_backend(model: str = "gpt-4o-mini"):
    """Build the LLM and embedding callables for the OpenAI backend.

    Per the original note on this function, the backend requires the
    ``OPENAI_API_KEY`` environment variable.

    Args:
        model: Name of the chat model passed to ``openai_complete_if_cache``.

    Returns:
        A ``(llm_func, embed_cfg)`` tuple: an async completion callable and an
        ``EmbeddingFunc`` wrapping an async embedding callable that uses the
        ``text-embedding-3-small`` model (declared dimension 1536).
    """
    try:
        # Legacy layout: helpers exported directly from ``lightrag.llm``.
        from lightrag.llm import openai_complete_if_cache
        from lightrag.llm import openai_embedding as openai_embed
    except ImportError:
        # Package layout: provider bindings live in ``lightrag.llm.openai``.
        from lightrag.llm.openai import openai_complete_if_cache, openai_embed

    async def llm_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        # ``None`` instead of a shared mutable default; the helper always
        # receives a list.
        return await openai_complete_if_cache(
            model,
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages or [],
            **kwargs,
        )

    async def embed_func(texts: list[str]):
        return await openai_embed(texts, model="text-embedding-3-small")

    embed_cfg = EmbeddingFunc(
        embedding_dim=1536,
        max_token_size=8192,
        func=embed_func,
    )
    return llm_func, embed_cfg
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_ollama_backend(
    model: str = "qwen2.5:14b",
    embed_model: str = "nomic-embed-text",
    embedding_dim: int = 768,
):
    """Build the LLM and embedding callables for the Ollama backend.

    Per the original note on this function, a running ``ollama serve`` is
    required.

    Args:
        model: Name of the Ollama chat model.
        embed_model: Name of the Ollama embedding model.
        embedding_dim: Vector size declared to LightRAG. The default of 768 is
            the value the original code used for ``nomic-embed-text``; pass the
            matching value when using a different embedding model.

    Returns:
        A ``(llm_func, embed_cfg)`` tuple: an async completion callable and an
        ``EmbeddingFunc`` wrapping an async embedding callable.
    """
    # NOTE(review): ``ollama_model_complete`` takes the prompt as its first
    # positional argument and reads the model name from the LightRAG config,
    # so it cannot be called as ``(model, prompt, ...)``. The model-first
    # helper is used instead; its name differs between LightRAG layouts --
    # confirm against the installed version.
    try:
        # Legacy layout: helpers exported directly from ``lightrag.llm``.
        from lightrag.llm import ollama_model_if_cache as ollama_complete
        from lightrag.llm import ollama_embedding as ollama_embed
    except ImportError:
        # Package layout: provider bindings live in ``lightrag.llm.ollama``.
        from lightrag.llm.ollama import _ollama_model_if_cache as ollama_complete
        from lightrag.llm.ollama import ollama_embed

    async def llm_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        # Mirror the public wrapper: keyword extraction asks for JSON output
        # and the flag itself must not reach the Ollama client.
        if kwargs.pop("keyword_extraction", None):
            kwargs["format"] = "json"
        return await ollama_complete(
            model,
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages or [],
            **kwargs,
        )

    async def embed_func(texts: list[str]):
        # Passed positionally: the helper's parameter is named ``embed_model``,
        # so the original ``model=`` keyword did not bind to it.
        return await ollama_embed(texts, embed_model)

    embed_cfg = EmbeddingFunc(
        embedding_dim=embedding_dim,
        max_token_size=8192,
        func=embed_func,
    )
    return llm_func, embed_cfg
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_lmstudio_backend(model: str = "qwen2.5:14b"):
    """Build the LLM and embedding callables for the LM Studio backend.

    Completions go through LM Studio's OpenAI-compatible API. The server URL
    is read from ``LMSTUDIO_URL`` (default ``http://127.0.0.1:1234/v1``) and
    the key from ``LMSTUDIO_API_KEY`` (default ``lm-studio``). Embeddings are
    computed locally with sentence-transformers.

    Args:
        model: Name of the chat model served by LM Studio.

    Returns:
        A ``(llm_func, embed_cfg)`` tuple: an async completion callable and an
        ``EmbeddingFunc`` (declared dimension 384, max token size 512).
    """
    try:
        # Legacy layout: helper exported directly from ``lightrag.llm``.
        from lightrag.llm import openai_complete_if_cache
    except ImportError:
        # Package layout: provider bindings live in ``lightrag.llm.openai``.
        from lightrag.llm.openai import openai_complete_if_cache
    from sentence_transformers import SentenceTransformer

    base_url = os.environ.get("LMSTUDIO_URL", "http://127.0.0.1:1234/v1")
    api_key = os.environ.get("LMSTUDIO_API_KEY", "lm-studio")

    async def llm_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        # ``None`` instead of a shared mutable default; the helper always
        # receives a list.
        return await openai_complete_if_cache(
            model,
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages or [],
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    # Embeddings come from a local sentence-transformers model: per the
    # original note, the chat model (qwen3-vl-4b) cannot produce embeddings,
    # so a separate model is needed.
    embed_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

    async def embed_func(texts: list[str]):
        # Return the NumPy array itself, matching the opencode backend in
        # this file.
        return embed_model.encode(texts, convert_to_numpy=True)

    embed_cfg = EmbeddingFunc(
        embedding_dim=384,
        max_token_size=512,
        func=embed_func,
    )
    return llm_func, embed_cfg
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_opencode_backend(model: str = "opencode/deepseek-v4-flash-free"):
    """Build the LLM and embedding callables for the OpenCode backend.

    Completions are requested through ``openai.AsyncOpenAI`` against the URL
    in ``OPENCODE_URL`` (default ``https://opencode.ai/zen/v1``) with the key
    from ``OPENCODE_API_KEY`` (default: empty string). Embeddings are computed
    locally with sentence-transformers.

    Args:
        model: Model identifier sent with every chat completion request.

    Returns:
        A ``(llm_func, embed_cfg)`` tuple: an async completion callable and an
        ``EmbeddingFunc`` (declared dimension 384, max token size 512).
    """
    from openai import AsyncOpenAI
    from sentence_transformers import SentenceTransformer

    base_url = os.environ.get("OPENCODE_URL", "https://opencode.ai/zen/v1")
    api_key = os.environ.get("OPENCODE_API_KEY", "")
    client = AsyncOpenAI(base_url=base_url, api_key=api_key)

    async def llm_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        # Assemble the conversation: optional system prompt, prior turns,
        # then the current user prompt.
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        if history_messages:
            messages.extend(history_messages)
        messages.append({"role": "user", "content": prompt})

        # Only ``temperature`` and ``max_tokens`` are read from kwargs; any
        # other keyword arguments are ignored.
        response = await client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=kwargs.get("temperature", 0.3),
            max_tokens=kwargs.get("max_tokens", 1024),
        )
        # ``content`` may be None; always hand a string back to the caller.
        return response.choices[0].message.content or ""

    # Embeddings come from a local sentence-transformers model.
    embed_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

    async def embed_func(texts: list[str]):
        # ``convert_to_numpy=True`` makes ``encode`` return a NumPy array.
        return embed_model.encode(texts, convert_to_numpy=True)

    embed_cfg = EmbeddingFunc(
        embedding_dim=384,
        max_token_size=512,
        func=embed_func,
    )
    return llm_func, embed_cfg
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
# Preparing texts from OCR
# ------------------------------------------------------------------
|
|||
|
|
def format_page_document(page: dict, page_idx: int, total: int, vlm_descs: Optional[dict] = None) -> str:
    """Turn the data of one page into a plain-text document for RAG.

    Args:
        page: Page record. ``image`` is required; ``page_number``,
            ``pdf_text_layer`` and ``ocr_lines`` are optional.
        page_idx: Zero-based position of the page; used as ``page_idx + 1``
            when the record carries no ``page_number``.
        total: Total number of pages, shown in the header line.
        vlm_descs: Optional mapping from image name to a VLM description.

    Returns:
        The document text: a header, then (when present) the VLM description,
        the PDF text layer (first 2000 characters), the OCR lines with
        confidence and bounding box, and the extracted entities.

    Raises:
        KeyError: If ``page`` has no ``image`` key or an OCR entry has no
            ``text`` key.
    """
    image = page["image"]
    page_number = page.get("page_number", page_idx + 1)

    lines = [
        f"=== Чертеж: страница {page_number} из {total} ===",
        f"Изображение: {image}",
        "",
    ]

    # VLM description of the image (if any).
    if vlm_descs and image in vlm_descs:
        lines.append("--- Описание модели зрения (VLM) ---")
        lines.append(vlm_descs[image])
        lines.append("")

    # PDF text layer (if any). ``or ""`` also covers an explicit JSON null.
    pdf_text = (page.get("pdf_text_layer") or "").strip()
    if pdf_text:
        lines.append("--- Текстовый слой PDF ---")
        lines.append(pdf_text[:2000])  # truncated to avoid overloading the document
        lines.append("")

    # OCR results.
    ocr_lines = page.get("ocr_lines") or []
    if ocr_lines:
        lines.append("--- Распознанный текст (OCR) ---")
        for entry in ocr_lines:
            txt = entry["text"].strip()
            conf = entry.get("confidence")
            if conf is None:
                # A missing or null confidence is reported as 0.00.
                conf = 0.0
            bbox = entry.get("bbox", [])
            lines.append(f' "{txt}" (confidence={conf:.2f}, bbox={bbox})')
        lines.append("")

        # List the entities explicitly (helps the LLM build the graph).
        # Without OCR lines nothing can be extracted, so the step is skipped.
        entities = extract_entities_from_page(ocr_lines)
        if any(entities.values()):
            lines.append("--- Извлеченные сущности ---")
            if entities["floors"]:
                lines.append(f"Этажи: {', '.join(entities['floors'])}")
            if entities["axes"]:
                lines.append(f"Оси: {', '.join(entities['axes'])}")
            if entities["dimensions"]:
                lines.append(f"Размеры: {', '.join(entities['dimensions'])}")
            if entities["rooms"]:
                lines.append(f"Помещения: {', '.join(entities['rooms'])}")
            lines.append("")

    return "\n".join(lines)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_entities_from_page(ocr_lines: list[dict]) -> dict:
    """Extract floor, axis, dimension and room mentions from OCR lines.

    Args:
        ocr_lines: OCR entries; the ``text`` value of each entry is scanned.
            Entries with missing or empty text are skipped.

    Returns:
        A dict with the keys ``floors``, ``axes``, ``dimensions`` and
        ``rooms``, each a list of unique strings in natural order (digit runs
        compare numerically, so "2" sorts before "10").
    """
    import re

    floor_re = re.compile(r"(\d+)[-\s]?й\s*этаж", re.I)
    axis_re = re.compile(r"\b([А-Я])\b")
    num_axis_re = re.compile(r"\b(\d{1,2})\b")
    # Three to five digits, optionally followed by "м" or "мм".
    dim_re = re.compile(r"\b(\d{3,5})\s*м{0,2}\b")
    room_re = re.compile(r"(квартира|кв\.?\s*\d+|помещение\s*\d+)", re.I)

    def natural_key(value: str) -> list:
        # Split into digit and non-digit runs; the leading tag keeps numbers
        # and text from ever being compared with each other.
        return [
            (0, int(tok), "") if tok.isdecimal() else (1, 0, tok)
            for tok in re.split(r"(\d+)", value)
            if tok
        ]

    floors, axes, dims, rooms = set(), set(), set(), set()

    for entry in ocr_lines:
        text = entry.get("text") or ""
        if not text:
            continue

        floors.update(m.group(0) for m in floor_re.finditer(text))
        axes.update(m.group(1) for m in axis_re.finditer(text))
        # Only one- or two-digit numbers in 1..50 count as axis labels.
        axes.update(
            m.group(1) for m in num_axis_re.finditer(text)
            if 1 <= int(m.group(1)) <= 50
        )
        # Only values in 100..20000 are kept; they are labelled in mm.
        dims.update(
            m.group(1) + " мм" for m in dim_re.finditer(text)
            if 100 <= int(m.group(1)) <= 20000
        )
        rooms.update(m.group(0) for m in room_re.finditer(text))

    return {
        "floors": sorted(floors, key=natural_key),
        "axes": sorted(axes, key=natural_key),
        "dimensions": sorted(dims, key=natural_key),
        "rooms": sorted(rooms, key=natural_key),
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
def build_documents(ocr_path: Path, vlm_desc_path: Optional[Path] = None) -> list[str]:
    """Assemble the per-page text documents from the OCR results file.

    Args:
        ocr_path: Path to the OCR results JSON; its ``pages`` list is read.
        vlm_desc_path: Optional path to a JSON file with VLM descriptions,
            handed to ``format_page_document`` for lookup by image name.
            A path that does not exist is reported and then ignored.

    Returns:
        One formatted document string per page, in file order.

    Raises:
        KeyError: If the OCR JSON has no ``pages`` key.
    """
    data = json.loads(ocr_path.read_text(encoding="utf-8"))
    pages = data["pages"]

    vlm_descs = {}
    if vlm_desc_path is not None:
        if vlm_desc_path.exists():
            vlm_descs = json.loads(vlm_desc_path.read_text(encoding="utf-8"))
            print(f" VLM-описания загружены: {len(vlm_descs)} шт.")
        else:
            # Descriptions were requested but the file is absent: say so
            # instead of silently indexing without them.
            print(f" [WARN] Файл VLM-описаний не найден: {vlm_desc_path}")

    total = len(pages)
    return [
        format_page_document(page, idx, total, vlm_descs)
        for idx, page in enumerate(pages)
    ]
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------
|
|||
|
|
async def index_folder(folder: Path, backend: str, model: str, embed_model: Optional[str], use_vlm: bool = False):
    """Build the LightRAG index for one OCR output folder.

    Reads ``<folder>/full_ocr_results.json``, builds one text document per
    page and inserts the documents one by one into a LightRAG instance whose
    working directory is ``<folder>/lightrag_cache``.

    Args:
        folder: Folder containing ``full_ocr_results.json``.
        backend: One of ``openai``, ``ollama``, ``lmstudio``, ``opencode``.
        model: LLM model name handed to the selected backend factory.
        embed_model: Embedding model name; only used by the ``ollama``
            backend, where ``None`` falls back to ``nomic-embed-text``.
        use_vlm: When true, ``<folder>/vlm_descriptions.json`` is passed to
            ``build_documents`` as the VLM description file.

    The process exits with status 1 when the OCR file is missing or the
    backend name is unknown.
    """
    ocr_path = folder / "full_ocr_results.json"
    if not ocr_path.exists():
        print(f"[ERR] Не найден {ocr_path}")
        sys.exit(1)

    cache_dir = folder / "lightrag_cache"
    cache_dir.mkdir(exist_ok=True)

    vlm_desc_path = folder / "vlm_descriptions.json" if use_vlm else None

    print(f"[1/4] Загрузка OCR-данных из {ocr_path}")
    docs = build_documents(ocr_path, vlm_desc_path)
    print(f" Страниц: {len(docs)}")

    print(f"[2/4] Инициализация LightRAG (backend={backend}, model={model})")
    if backend == "openai":
        llm_func, embed_cfg = get_openai_backend(model)
    elif backend == "ollama":
        llm_func, embed_cfg = get_ollama_backend(model, embed_model or "nomic-embed-text")
    elif backend == "lmstudio":
        llm_func, embed_cfg = get_lmstudio_backend(model)
    elif backend == "opencode":
        llm_func, embed_cfg = get_opencode_backend(model)
    else:
        print(f"[ERR] Неизвестный backend: {backend}")
        sys.exit(1)

    rag = LightRAG(
        working_dir=str(cache_dir),
        llm_model_func=llm_func,
        embedding_func=embed_cfg,
    )

    print("[2.5/4] Инициализация хранилищ...")
    await rag.initialize_storages()
    # NOTE(review): some LightRAG releases also require
    # ``initialize_pipeline_status()`` from ``lightrag.kg.shared_storage``
    # after ``initialize_storages()`` -- confirm against the installed version.

    try:
        print("[3/4] Вставка документов в индекс...")
        doc_count = len(docs)
        for i, doc_text in enumerate(docs, 1):
            print(f" [{i}/{doc_count}] Страница {i}")
            await rag.ainsert(doc_text)
    finally:
        # Finalize the storages even when an insert fails part-way; older
        # releases without ``finalize_storages`` are left untouched.
        finalize = getattr(rag, "finalize_storages", None)
        if finalize is not None:
            await finalize()

    print(f"[4/4] Готово. Индекс сохранен в {cache_dir}")
    print(f" Для запросов используйте: python rag_query.py \"{folder}\"")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Parse the command line and run the indexing pipeline.

    When ``--model`` is omitted, a per-backend default is used. An explicitly
    given model name is always passed through unchanged.
    """
    # Per-backend fallbacks, applied only when ``--model`` is not given.
    default_models = {
        "openai": "gpt-4o-mini",
        "ollama": "qwen2.5:14b",
        "lmstudio": "qwen2.5:14b",
        "opencode": "mimo-v2.5-free",
    }

    parser = argparse.ArgumentParser(description="Построение LightRAG индекса из OCR")
    parser.add_argument("folder", help="Папка с full_ocr_results.json")
    parser.add_argument("--backend", choices=["openai", "ollama", "lmstudio", "opencode"], default="openai",
                        help="LLM backend (default: openai)")
    # ``None`` marks "not specified", so an explicit ``--model gpt-4o-mini``
    # is no longer mistaken for the default and replaced.
    parser.add_argument("--model", default=None,
                        help="Название модели LLM (default: openai: gpt-4o-mini, "
                             "ollama/lmstudio: qwen2.5:14b, opencode: mimo-v2.5-free)")
    parser.add_argument("--embed-model", default=None,
                        help="Модель эмбеддингов (только для Ollama, default: nomic-embed-text)")
    parser.add_argument("--vlm-desc", action="store_true",
                        help="Использовать VLM-описания из vlm_descriptions.json")
    args = parser.parse_args()

    folder = Path(args.folder)
    model = args.model or default_models[args.backend]

    asyncio.run(index_folder(folder, args.backend, model, args.embed_model, args.vlm_desc))


if __name__ == "__main__":
    main()
|