- Add rag_indexer.py: build LightRAG index from OCR with OpenCode API - Add rag_query.py: query the knowledge graph - Add vlm_describer.py: generate VLM descriptions via LM Studio - Add test_model.py: quick check for LightRAG-compatible models - Add run_pipeline.sh and run_pipeline.bat: full OCR → VLM → RAG pipeline - Fix rapidocr import (rapidocr_onnxruntime) - Fix process_any_pdf.py paths for cross-platform use - Add .env.example, README_RAG.md, AGENTS.md - Update .gitignore for outputs and secrets
262 lines
9.5 KiB
Python
262 lines
9.5 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Тестовый скрипт распознавания PDF (чертежи / документы)
|
||
Стек: PyMuPDF (рендер) + RapidOCR (текст+координаты) + Docling (структура)
|
||
Работает на CPU.
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import fitz # PyMuPDF
|
||
from pathlib import Path
|
||
|
||
# ------------------------------------------------------------------
|
||
# 1. RapidOCR - лёгкий ONNX-OCR (входит в зависимости Docling)
|
||
# ------------------------------------------------------------------
|
||
# The OCR engine is optional: when the import or the constructor fails,
# `engine` is left as None so the rest of the script can still run.
engine = None
try:
    from rapidocr_onnxruntime import RapidOCR

    engine = RapidOCR()
    print("[OK] RapidOCR загружен")
except Exception as exc:
    print(f"[ERR] RapidOCR: {exc}")
    engine = None
|
||
|
||
# ------------------------------------------------------------------
|
||
# 2. Docling - структурный анализ PDF
|
||
# ------------------------------------------------------------------
|
||
# Docling is optional as well: on any failure `converter` is None and the
# structural-analysis step is skipped by its caller.
converter = None
try:
    from docling.document_converter import DocumentConverter

    converter = DocumentConverter()
    print("[OK] Docling загружен")
except Exception as exc:
    print(f"[ERR] Docling: {exc}")
    converter = None
|
||
|
||
# ------------------------------------------------------------------
|
||
# 3. Конфигурация
|
||
# ------------------------------------------------------------------
|
||
# Input PDF and output directory for this test run (hard-coded Windows paths).
PDF_NAME = "Кронштадтский 16-18 НК1_ОСК (v3).pdf"
PDF_PATH = Path(r"D:\TEST docs") / PDF_NAME
OUT_DIR = Path(r"D:\TEST docs\output")
# parents=True: also create missing parent folders, so importing the script
# does not die with FileNotFoundError before main() can report a missing PDF.
OUT_DIR.mkdir(parents=True, exist_ok=True)

MAX_PAGES_OCR = 10  # test mode: first 10 pages
DPI = 300  # page render resolution
|
||
|
||
# ------------------------------------------------------------------
|
||
# 4. Рендер страниц PDF -> PNG (PyMuPDF)
|
||
# ------------------------------------------------------------------
|
||
def render_pages(pdf_path: Path, out_dir: Path, max_pages: "int | None" = None, dpi: int = 300):
    """Render the leading pages of a PDF to PNG files with PyMuPDF.

    Args:
        pdf_path: PDF file to open.
        out_dir: Directory that receives the ``page_NNN.png`` files
            (it is not created here).
        max_pages: Upper bound on the number of pages rendered; a falsy
            value (``None`` or 0) renders every page.
        dpi: Render resolution.

    Returns:
        The paths of the written PNG files, in page order.
    """
    # PDF pages are defined at 72 dpi, so dpi / 72 is the zoom factor.
    # The matrix does not depend on the page, so build it once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    images = []
    # The context manager closes the document even if rendering or saving
    # a page raises, which the explicit close() at the end did not.
    with fitz.open(pdf_path) as doc:
        total = min(max_pages, len(doc)) if max_pages else len(doc)
        for i in range(total):
            page = doc.load_page(i)
            pix = page.get_pixmap(matrix=mat)
            img_path = out_dir / f"page_{i+1:03d}.png"
            pix.save(img_path)
            images.append(img_path)
            print(f" -> страница {i+1} отрендерена ({img_path.name})")
    return images
|
||
|
||
# ------------------------------------------------------------------
|
||
# 5. OCR - извлечение текста с координатами (RapidOCR)
|
||
# ------------------------------------------------------------------
|
||
def run_ocr(image_paths: list[Path]):
    """Recognise text on each image with the module-level RapidOCR engine.

    Args:
        image_paths: Images to process, in order.

    Returns:
        A dict keyed by image file name; each value is a list of
        ``{"text", "confidence", "bbox"}`` dicts. An empty dict is
        returned when the engine is unavailable.
    """
    if engine is None:
        print("[WARN] OCR-dvigok nedostupen, propuskaem")
        return {}

    results = {}
    for img_path in image_paths:
        print(f" OCR: {img_path.name} ...")
        res = engine(img_path)
        # res is a tuple (results, timing); results is a list of
        # [box, text, confidence] items and is checked for None below.
        detections = res[0] if res and res[0] is not None else []
        entries = [
            {"text": txt, "confidence": float(score), "bbox": box}
            for box, txt, score in detections
        ]
        results[img_path.name] = entries
        print(f" naydeno {len(entries)} strok")
    return results
|
||
|
||
# ------------------------------------------------------------------
|
||
# 6. Docling - структурный анализ (текстовый слой PDF)
|
||
# ------------------------------------------------------------------
|
||
def run_docling(pdf_path: Path, max_pages: int = 10):
    """Convert the first pages of a PDF to Markdown with Docling.

    The PDF is first copied (and, if longer than ``max_pages``, truncated)
    into a temporary directory, and Docling is run on that copy.

    Args:
        pdf_path: Source PDF.
        max_pages: Maximum number of leading pages handed to Docling.

    Returns:
        The Markdown export as a string, or ``None`` when Docling is not
        available or the conversion raises.
    """
    if converter is None:
        print("[WARN] Docling nedostupen, propuskaem")
        return None

    import shutil
    import tempfile

    # Docling sometimes fails on Cyrillic paths -> work on a copy stored
    # under a Latin-only temp path. The copy is also limited to max_pages
    # to avoid std::bad_alloc on CPU with large PDFs.
    tmp_dir = tempfile.mkdtemp(prefix="docling_")
    try:
        tmp_path = Path(tmp_dir) / "input.pdf"
        with fitz.open(pdf_path) as doc:
            if len(doc) > max_pages:
                with fitz.open() as new_doc:
                    for i in range(max_pages):
                        new_doc.insert_pdf(doc, from_page=i, to_page=i)
                    new_doc.save(tmp_path)
            else:
                shutil.copy2(pdf_path, tmp_path)

        print(f" Docling: obrabotka (temp copy, first {max_pages} pages) ...")
        try:
            result = converter.convert(tmp_path)
            return result.document.export_to_markdown()
        except Exception as e:
            print(f"[ERR] Docling: {e}")
            return None
    finally:
        # Best-effort cleanup that also runs when preparing the copy fails
        # and that removes the directory even if extra files ended up in it.
        shutil.rmtree(tmp_dir, ignore_errors=True)
|
||
|
||
# ------------------------------------------------------------------
|
||
# 7. Кастомный парсер - простая эвристика для чертежей
|
||
# ------------------------------------------------------------------
|
||
def parse_drawing_text(ocr_data: dict):
    """Extract floor, axis, dimension and room mentions from OCR output.

    Args:
        ocr_data: Mapping of image name to a list of OCR entries; only the
            ``"text"`` key of each entry is read.

    Returns:
        A dict with the keys ``"floors"``, ``"axes"``, ``"dimensions"`` and
        ``"rooms"``, each holding a de-duplicated sorted list of matched
        strings. Numeric axes and dimensions are ordered by value (numeric
        axes come before letter axes); floors and rooms are ordered as
        plain strings.
    """
    import re

    floor_patterns = [
        re.compile(r"(\d+)[-\s]?й\s*этаж", re.I),
        re.compile(r"этаж\s*(\d+)", re.I),
        re.compile(r"этаж\s*([А-Я]\d?)", re.I),
        re.compile(r"НК\d", re.I),
    ]
    axis_patterns = [
        re.compile(r"\b([А-Я])\b"),  # single letter
        re.compile(r"\b(\d{1,2})\b"),  # one- or two-digit number
        re.compile(r"([А-Я])\s*[-–]\s*([А-Я])"),  # letter range such as А-Б
    ]
    dim_patterns = [
        re.compile(r"\b(\d{3,5})\s*м?[мм]?\b"),  # 3600, 5400, ...
    ]
    room_patterns = [
        # The number is optional after the full word so that "Квартира 12"
        # keeps its number instead of being cut down to "Квартира".
        re.compile(r"([Кк]вартира(?:\s*\d+)?|[Кк]в\.?\s*\d+)", re.I),
        re.compile(r"([Пп]омещение)\s*(\d+)", re.I),
    ]

    floors, axes, dimensions, rooms = set(), set(), set(), set()
    # Deletes the separators that may sit inside an axis range match.
    drop_separators = str.maketrans("", "", " -–")

    for lines in ocr_data.values():
        for entry in lines:
            text = entry["text"].strip()

            # finditer everywhere: one OCR line may mention several entities.
            for pat in floor_patterns:
                floors.update(m.group(0) for m in pat.finditer(text))

            for pat in axis_patterns:
                for m in pat.finditer(text):
                    label = m.group(0).translate(drop_separators)
                    if label and len(label) <= 3:
                        axes.add(label)

            for pat in dim_patterns:
                for m in pat.finditer(text):
                    val = m.group(1)
                    if 100 <= int(val) <= 50000:
                        dimensions.add(val)

            for pat in room_patterns:
                rooms.update(m.group(0) for m in pat.finditer(text))

    # Drop implausible axes: keep letter labels and numbers within 1..50.
    axes = {a for a in axes if a.isalpha() or (a.isdigit() and 1 <= int(a) <= 50)}

    def axis_key(label):
        # Numbers first, ordered by value; letter labels after, as strings.
        if label.isdigit():
            return (0, int(label), label)
        return (1, 0, label)

    return {
        "floors": sorted(floors),
        "axes": sorted(axes, key=axis_key),
        "dimensions": sorted(dimensions, key=lambda v: (int(v), v)),
        "rooms": sorted(rooms),
    }
|
||
|
||
# ------------------------------------------------------------------
|
||
# 8. main
|
||
# ------------------------------------------------------------------
|
||
def main():
    """Run the four-step test pipeline on PDF_PATH and write results to OUT_DIR.

    Steps: render pages to PNG, OCR the images, run Docling on the PDF,
    then extract drawing entities from the OCR text. Exits with status 1
    when PDF_PATH does not exist.
    """

    def save_json(target, payload):
        # Write payload as readable UTF-8 JSON and report where it went.
        with open(target, "w", encoding="utf-8") as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)
        print(f" -> sohraneno {target}")

    if not PDF_PATH.exists():
        print(f"[ERR] Файл не найден: {PDF_PATH}")
        sys.exit(1)

    # Open the PDF briefly just to report its page count.
    doc_tmp = fitz.open(PDF_PATH)
    page_count = doc_tmp.page_count
    doc_tmp.close()
    print(f"\n=== PDF: {PDF_PATH.name} ===")
    print(f"Vsego stranits: {page_count}")

    # Step 1: render pages to images.
    print("\n[1/4] Render stranits v izobrazheniya...")
    images = render_pages(PDF_PATH, OUT_DIR, MAX_PAGES_OCR, DPI)

    # Step 2: OCR the rendered images.
    print("\n[2/4] OCR (RapidOCR)...")
    ocr_results = run_ocr(images)
    save_json(OUT_DIR / "ocr_results.json", ocr_results)

    # Step 3: structural analysis with Docling.
    print("\n[3/4] Strukturnyy analiz (Docling)...")
    md_text = run_docling(PDF_PATH, max_pages=MAX_PAGES_OCR)
    if not md_text:
        print(" Docling ne dal rezultata")
    else:
        md_path = OUT_DIR / "docling_output.md"
        with open(md_path, "w", encoding="utf-8") as fh:
            fh.write(md_text)
        print(f" -> sohraneno {md_path}")

    # Step 4: heuristic entity extraction from the OCR text.
    print("\n[4/4] Custom parser chertezha...")
    parsed = parse_drawing_text(ocr_results)
    save_json(OUT_DIR / "parsed_entities.json", parsed)

    print("\n--- Naydennye sushchnosti ---")
    for category, values in parsed.items():
        print(f" {category}: {values}")

    print(f"\n=== Gotovo. Rezultaty v {OUT_DIR} ===")
|
||
|
||
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|