opencode/process_pdf.py
Кирилл Блинов c756a5766b Add RAG pipeline: LightRAG indexer, OpenCode API, VLM describer, and test tools
- Add rag_indexer.py: build LightRAG index from OCR with OpenCode API
- Add rag_query.py: query the knowledge graph
- Add vlm_describer.py: generate VLM descriptions via LM Studio
- Add test_model.py: quick check for LightRAG-compatible models
- Add run_pipeline.sh and run_pipeline.bat: full OCR → VLM → RAG pipeline
- Fix rapidocr import (rapidocr_onnxruntime)
- Fix process_any_pdf.py paths for cross-platform use
- Add .env.example, README_RAG.md, AGENTS.md
- Update .gitignore for outputs and secrets
2026-05-29 09:54:37 +03:00

262 lines
9.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Тестовый скрипт распознавания PDF (чертежи / документы)
Стек: PyMuPDF (рендер) + RapidOCR (текст+координаты) + Docling (структура)
Работает на CPU.
"""
import os
import sys
import json
import fitz # PyMuPDF
from pathlib import Path
# ------------------------------------------------------------------
# 1. RapidOCR - лёгкий ONNX-OCR (входит в зависимости Docling)
# ------------------------------------------------------------------
# Optional OCR backend. `engine` is always bound after this block: a RapidOCR
# instance on success, None on any failure (run_ocr() checks for None).
try:
    # Only the statements that can legitimately fail because of the backend
    # live in the try body.
    from rapidocr_onnxruntime import RapidOCR
    engine = RapidOCR()
except Exception as e:
    # Best effort: the script keeps working without OCR, so any import or
    # initialisation error is reported and the backend is disabled.
    print(f"[ERR] RapidOCR: {e}")
    engine = None
else:
    # Kept outside the try body so that a failure while printing the status
    # line is not misreported as a RapidOCR failure that disables the engine.
    print("[OK] RapidOCR загружен")
# ------------------------------------------------------------------
# 2. Docling - структурный анализ PDF
# ------------------------------------------------------------------
# Optional structural-analysis backend. `converter` is always bound after this
# block: a DocumentConverter on success, None on any failure (run_docling()
# checks for None).
try:
    # Only the backend import and construction are guarded.
    from docling.document_converter import DocumentConverter
    converter = DocumentConverter()
except Exception as e:
    # Best effort: report the problem and continue without Docling.
    print(f"[ERR] Docling: {e}")
    converter = None
else:
    # Kept outside the try body so that a failure while printing the status
    # line is not misreported as a Docling failure that disables the converter.
    print("[OK] Docling загружен")
# ------------------------------------------------------------------
# 3. Конфигурация
# ------------------------------------------------------------------
# Input document and output location used by main().
PDF_NAME = "Кронштадтский 16-18 НК1_ОСК (v3).pdf"
PDF_PATH = Path(r"D:\TEST docs") / PDF_NAME
OUT_DIR = Path(r"D:\TEST docs\output")
# parents=True: without it a missing parent folder raises FileNotFoundError at
# import time, before main() can print its own "file not found" message.
OUT_DIR.mkdir(parents=True, exist_ok=True)
MAX_PAGES_OCR = 10  # test mode: first 10 pages
DPI = 300  # resolution used when rendering pages to images
# ------------------------------------------------------------------
# 4. Рендер страниц PDF -> PNG (PyMuPDF)
# ------------------------------------------------------------------
def render_pages(pdf_path: Path, out_dir: Path, max_pages: "int | None" = None, dpi: int = 300) -> list[Path]:
    """
    Render PDF pages to PNG files with PyMuPDF.

    Args:
        pdf_path: PDF file to open.
        out_dir: directory that receives ``page_NNN.png`` files; created if
            it does not exist yet.
        max_pages: upper bound on the number of pages rendered; a falsy value
            (None or 0) renders every page.
        dpi: target resolution of the rendered images.

    Returns:
        Paths of the written PNG files, in page order.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # The scale factor relative to the 72 dpi PDF default is the same for
    # every page, so the matrix is built once instead of once per page.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    images = []
    doc = fitz.open(pdf_path)
    try:
        total = min(max_pages, len(doc)) if max_pages else len(doc)
        for i in range(total):
            pix = doc.load_page(i).get_pixmap(matrix=mat)
            img_path = out_dir / f"page_{i+1:03d}.png"
            pix.save(img_path)
            images.append(img_path)
            print(f" -> страница {i+1} отрендерена ({img_path.name})")
    finally:
        # Close the document even when rendering or saving a page fails.
        doc.close()
    return images
# ------------------------------------------------------------------
# 5. OCR - извлечение текста с координатами (RapidOCR)
# ------------------------------------------------------------------
def run_ocr(image_paths: list[Path]):
    """
    Run the module-level OCR engine over page images.

    Args:
        image_paths: image files to recognise, processed in the given order.

    Returns:
        Dict keyed by image file name. Each value is a list of dicts with
        keys ``text``, ``confidence`` (float) and ``bbox`` (plain nested
        lists). Returns an empty dict when the OCR engine is unavailable.
    """
    if engine is None:
        print("[WARN] OCR-dvigok nedostupen, propuskaem")
        return {}

    def _plain(value):
        # Convert array-like values (anything exposing .tolist(), e.g. NumPy
        # arrays or scalars) and tuples into plain lists so that the result
        # can be passed to json.dump without a TypeError.
        if hasattr(value, "tolist"):
            return value.tolist()
        if isinstance(value, (list, tuple)):
            return [_plain(v) for v in value]
        return value

    results = {}
    for img_path in image_paths:
        print(f" OCR: {img_path.name} ...")
        res = engine(img_path)
        # The engine result is treated as a (detections, timing) pair whose
        # detections are [box, text, confidence] triples, or None when
        # nothing was found.
        detections = res[0] if res else None
        entries = []
        if detections is not None:
            entries = [
                {"text": txt, "confidence": float(score), "bbox": _plain(box)}
                for box, txt, score in detections
            ]
        results[img_path.name] = entries
        print(f" naydeno {len(entries)} strok")
    return results
# ------------------------------------------------------------------
# 6. Docling - структурный анализ (текстовый слой PDF)
# ------------------------------------------------------------------
def run_docling(pdf_path: Path, max_pages: int = 10):
    """
    Convert the first pages of a PDF to Markdown with Docling.

    The PDF is first copied to a temporary ASCII-only path, because Docling
    sometimes fails on Cyrillic paths, and trimmed to ``max_pages`` pages to
    avoid ``std::bad_alloc`` on CPU with large PDFs.

    Args:
        pdf_path: source PDF.
        max_pages: number of leading pages to keep; None or a non-positive
            value keeps the whole document.

    Returns:
        Markdown text, or None when Docling is unavailable or the conversion
        fails. Errors while opening or copying the PDF propagate to the
        caller.
    """
    if converter is None:
        print("[WARN] Docling nedostupen, propuskaem")
        return None

    import shutil
    import tempfile

    limit = max_pages if max_pages and max_pages > 0 else None

    tmp_dir = tempfile.mkdtemp(prefix="docling_")
    try:
        tmp_path = Path(tmp_dir) / "input.pdf"

        src = fitz.open(pdf_path)
        try:
            needs_trim = limit is not None and len(src) > limit
            if needs_trim:
                trimmed = fitz.open()
                try:
                    # One ranged insert copies pages 0..limit-1 in order.
                    trimmed.insert_pdf(src, from_page=0, to_page=limit - 1)
                    trimmed.save(tmp_path)
                finally:
                    trimmed.close()
        finally:
            src.close()

        if not needs_trim:
            # Short enough already: a plain file copy is sufficient.
            shutil.copy2(pdf_path, tmp_path)

        print(f" Docling: obrabotka (temp copy, first {max_pages} pages) ...")
        try:
            result = converter.convert(tmp_path)
            return result.document.export_to_markdown()
        except Exception as e:
            # Best effort: a failed conversion is reported, not fatal.
            print(f"[ERR] Docling: {e}")
            return None
    finally:
        # Remove the temp copy on every exit path, including errors raised
        # while building it; cleanup problems are deliberately ignored.
        shutil.rmtree(tmp_dir, ignore_errors=True)
# ------------------------------------------------------------------
# 7. Кастомный парсер - простая эвристика для чертежей
# ------------------------------------------------------------------
def parse_drawing_text(ocr_data: dict):
    """
    Scan OCR results for floor, axis, dimension and room mentions.

    Args:
        ocr_data: mapping of image name to a list of OCR entries; only the
            ``text`` value of each entry is read (missing or None counts as
            empty text).

    Returns:
        Dict with keys ``floors``, ``axes``, ``dimensions`` and ``rooms``,
        each holding a sorted list of the distinct strings found.
    """
    import re

    floor_patterns = [
        re.compile(r"(\d+)[-\s]?й\s*этаж", re.I),
        re.compile(r"этаж\s*(\d+)", re.I),
        re.compile(r"этаж\s*([А-Я]\d?)", re.I),
        re.compile(r"НК\d", re.I),
    ]
    # Dash variants are written as escapes so the source has no look-alike
    # characters: ASCII hyphen, en dash (U+2013) and em dash (U+2014).
    axis_patterns = [
        re.compile(r"\b([А-Я])\b"),  # single capital letter
        re.compile(r"\b(\d{1,2})\b"),  # one- or two-digit number
        re.compile(r"([А-Я])\s*[-\u2013\u2014]\s*([А-Я])"),  # letter range
    ]
    # Whitespace and dashes are removed from an axis match before storing it.
    axis_separators = re.compile(r"[\s\-\u2013\u2014]")
    dim_patterns = [
        # 3-5 digits optionally followed by up to two "м" (e.g. 3600, 5400мм).
        re.compile(r"\b(\d{3,5})\s*м{0,2}\b"),
    ]
    room_patterns = [
        re.compile(r"([Кк]вартира|[Кк]в\.?\s*\d+)", re.I),
        re.compile(r"([Пп]омещение)\s*(\d+)", re.I),
    ]

    floors, axes, dimensions, rooms = set(), set(), set(), set()

    for lines in ocr_data.values():
        for entry in lines:
            text = str(entry.get("text") or "").strip()

            # Floors: the first hit of each pattern, stored as matched.
            for pat in floor_patterns:
                m = pat.search(text)
                if m:
                    floors.add(m.group(0))

            # Axes: every hit, with separators stripped ("А - Б" -> "АБ").
            for pat in axis_patterns:
                for m in pat.finditer(text):
                    label = axis_separators.sub("", m.group(0))
                    if label and len(label) <= 3:
                        axes.add(label)

            # Dimensions: numeric value must lie within 100..50000.
            for pat in dim_patterns:
                for m in pat.finditer(text):
                    val = m.group(1)
                    if 100 <= int(val) <= 50000:
                        dimensions.add(val)

            # Rooms: the first hit of each pattern, stored as matched.
            for pat in room_patterns:
                m = pat.search(text)
                if m:
                    rooms.add(m.group(0))

    # Drop numeric axis labels outside 1..50 as likely false positives.
    axes = {a for a in axes if a.isalpha() or (a.isdigit() and 1 <= int(a) <= 50)}

    found = {
        "floors": floors,
        "axes": axes,
        "dimensions": dimensions,
        "rooms": rooms,
    }
    return {k: sorted(v) for k, v in found.items()}
# ------------------------------------------------------------------
# 8. main
# ------------------------------------------------------------------
def main():
    """
    Run the four pipeline stages on the configured PDF.

    Stages: render pages to images, OCR them, run the Docling structural
    analysis, then apply the drawing-text parser. Each stage's result is
    written into OUT_DIR. Exits with status 1 when PDF_PATH does not exist.
    """

    def _dump_json(target, payload):
        # Write payload as indented UTF-8 JSON, keeping non-ASCII text as is.
        with open(target, "w", encoding="utf-8") as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)

    if not PDF_PATH.exists():
        print(f"[ERR] Файл не найден: {PDF_PATH}")
        sys.exit(1)

    # Open the document briefly, only to report its page count.
    probe = fitz.open(PDF_PATH)
    page_count = probe.page_count
    probe.close()

    print(f"\n=== PDF: {PDF_PATH.name} ===")
    print(f"Vsego stranits: {page_count}")

    # Stage 1: render pages to images.
    print("\n[1/4] Render stranits v izobrazheniya...")
    images = render_pages(PDF_PATH, OUT_DIR, MAX_PAGES_OCR, DPI)

    # Stage 2: OCR the rendered images and store the raw result.
    print("\n[2/4] OCR (RapidOCR)...")
    ocr_results = run_ocr(images)
    ocr_json = OUT_DIR / "ocr_results.json"
    _dump_json(ocr_json, ocr_results)
    print(f" -> sohraneno {ocr_json}")

    # Stage 3: structural analysis; the output is saved only when non-empty.
    print("\n[3/4] Strukturnyy analiz (Docling)...")
    md_text = run_docling(PDF_PATH, max_pages=MAX_PAGES_OCR)
    if not md_text:
        print(" Docling ne dal rezultata")
    else:
        md_path = OUT_DIR / "docling_output.md"
        with open(md_path, "w", encoding="utf-8") as fh:
            fh.write(md_text)
        print(f" -> sohraneno {md_path}")

    # Stage 4: heuristic entity extraction from the OCR text.
    print("\n[4/4] Custom parser chertezha...")
    parsed = parse_drawing_text(ocr_results)
    parsed_json = OUT_DIR / "parsed_entities.json"
    _dump_json(parsed_json, parsed)
    print(f" -> sohraneno {parsed_json}")

    print("\n--- Naydennye sushchnosti ---")
    for category, values in parsed.items():
        print(f" {category}: {values}")
    print(f"\n=== Gotovo. Rezultaty v {OUT_DIR} ===")


if __name__ == "__main__":
    main()