opencode/vlm_describer.py
Кирилл Блинов eaddf9f14b Add VLM tools: Describer, QC checker, and GOST validator
- vlm_describer.py: objective extraction (beams, positions, GOSTs, dimensions) via qwen-vl-plus API. No error detection — only factual observation.
- vlm_qc_checker.py: VLM-based QC (deprecated in favor of rules-only QC)
- gost_dimension_validator.py: validate GOST references and dimension chains against known standards
2026-06-01 12:29:58 +03:00

229 lines
8.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
VLM Describer — объективное извлечение структуры чертежа.
Отправляет PNG в qwen-vl-plus (DashScope API) с промптом на фактическое
описание содержимого. НЕ ищет ошибки, НЕ оценивает качество.
Результат: <output_folder>/vlm_extraction.json — структурированное описание
каждой страницы для использования в RAG и cross-verification.
Использование:
python vlm_describer.py <output_folder> [--model MODEL]
Требует DASHSCOPE_API_KEY в .env или окружении.
"""
import os
import sys
import json
import base64
import io
import re
from pathlib import Path
from typing import List, Dict, Tuple
from PIL import Image
from openai import OpenAI
# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
# Cached API key; stays None until _load_api_key() resolves it.
API_KEY = None
# OpenAI-compatible DashScope endpoint, passed to the OpenAI client as base_url.
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
# Model name used when neither the caller nor the CLI supplies one.
DEFAULT_MODEL = "qwen-vl-plus"
def _load_api_key():
    """Return the DashScope API key, resolving and caching it on first use.

    Lookup order:
      1. the module-level ``API_KEY`` cache, if it is already set;
      2. a ``DASHSCOPE_API_KEY=...`` line in ``.env`` next to this file,
         then in ``.env`` one directory above;
      3. the ``DASHSCOPE_API_KEY`` environment variable.

    A key found in a ``.env`` file is also exported to ``os.environ``.
    Lines may be indented, prefixed with ``export`` and have their value
    wrapped in matching single or double quotes.

    Returns:
        The key string, or ``None`` when no source provides one.
    """
    global API_KEY
    if API_KEY:
        return API_KEY

    prefix = "DASHSCOPE_API_KEY="
    here = Path(__file__).parent
    for env_path in (here / ".env", here.parent / ".env"):
        # is_file(), not exists(): ".env" is also a common virtualenv
        # directory name, and reading a directory would raise.
        if not env_path.is_file():
            continue
        # Explicit encoding instead of the locale default; "utf-8-sig" drops
        # a BOM, and errors="replace" keeps a legacy-encoded comment from
        # aborting the lookup (the key itself is plain ASCII).
        content = env_path.read_text(encoding="utf-8-sig", errors="replace")
        for raw_line in content.splitlines():
            line = raw_line.strip()
            if line.startswith("export "):
                line = line[len("export "):].lstrip()
            if not line.startswith(prefix):
                continue
            value = line[len(prefix):].strip()
            # KEY="value" / KEY='value' are common in .env files; the quotes
            # are not part of the secret.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            if not value:
                # An empty placeholder must not mask a real key defined in a
                # later file or in the environment.
                continue
            API_KEY = value
            os.environ["DASHSCOPE_API_KEY"] = value
            return API_KEY

    API_KEY = os.environ.get("DASHSCOPE_API_KEY")
    return API_KEY
# Prompt sent together with every page image (see describe_page).
# It is written in Russian and tells the model to describe the page
# factually, not to look for errors or judge quality, and to answer with a
# bare JSON object (no markdown) holding the keys: page_type, title, beams,
# positions, gosts, description.
# NOTE(review): the rule about not mixing table masses into dimensions refers
# to dimensions, but the requested JSON schema has no "dimensions" key —
# confirm whether that key was dropped from the schema intentionally.
EXTRACTION_PROMPT = (
    "Ты — система распознавания чертежей. Опиши объективно, что изображено на этой странице. "
    "НЕ ищи ошибки, НЕ оценивай качество. Просто перечисли факты.\n\n"
    "Ответь СТРОГО в формате JSON (без markdown):\n"
    # Requested response schema.
    "{\n"
    ' "page_type": "plan / section / elevation / specification / detail / general_view / table / unknown",\n'
    ' "title": "заголовок или null",\n'
    ' "beams": ["Балка Б-1"],\n'
    ' "positions": ["П-1"],\n'
    ' "gosts": ["ГОСТ ..."],\n'
    ' "description": "2-3 предложения о содержимом"\n'
    "}\n\n"
    # Rules: only real elements, empty arrays when nothing is found, facts only.
    "ПРАВИЛА:\n"
    "- Только реальные элементы с чертежа, не придумывай\n"
    "- Пустой массив [] если нет элементов данного типа\n"
    "- НЕ включай массы из таблиц в размеры\n"
    "- Описание — только факты, без оценок"
)
def resize_image(image_path: Path, max_size: int = 2048) -> Tuple[str, float, Tuple[int, int]]:
    """Return the image as base64 text, downscaled to fit ``max_size``.

    Args:
        image_path: Path of the image file to encode.
        max_size: Maximum allowed length, in pixels, of the longer side.

    Returns:
        A tuple ``(b64, scale, (orig_w, orig_h))``:
          * ``b64`` — base64 of the untouched file bytes when the image
            already fits, otherwise base64 of the downscaled image
            re-encoded as PNG;
          * ``scale`` — the factor applied to both sides (``1.0`` when the
            image was not resized);
          * ``(orig_w, orig_h)`` — the original pixel size.
    """
    resized_png = None
    scale = 1.0
    # The context manager releases the file handle that Image.open keeps
    # open; the resize happens inside it because pixel data is loaded lazily.
    with Image.open(image_path) as img:
        orig_w, orig_h = img.size
        longest = max(orig_w, orig_h)
        if longest > max_size:
            scale = max_size / longest
            # Clamp to 1 px: for an extreme aspect ratio the shorter side
            # would otherwise truncate to 0 and make resize() fail.
            new_w = max(1, int(orig_w * scale))
            new_h = max(1, int(orig_h * scale))
            buf = io.BytesIO()
            img.resize((new_w, new_h), Image.LANCZOS).save(buf, format="PNG")
            resized_png = buf.getvalue()

    if resized_png is None:
        # Already small enough: send the file bytes as they are on disk.
        with open(image_path, "rb") as f:
            payload = f.read()
    else:
        payload = resized_png
    return base64.b64encode(payload).decode("utf-8"), scale, (orig_w, orig_h)
def parse_json_response(text: str) -> Dict:
    """Parse the JSON object contained in a VLM reply.

    The reply may be wrapped in a Markdown code fence and surrounded by
    prose; the fence is stripped and the text is narrowed to the span
    between the first ``{`` and the last ``}`` before decoding. If that span
    is not valid JSON (for example, trailing prose that itself contains a
    ``}``), the object starting at the first ``{`` is decoded on its own.

    Args:
        text: Raw reply text from the model.

    Returns:
        The decoded JSON object. When no JSON object can be decoded — the
        text is malformed, or the decoded value is not an object — a stub
        dict is returned instead, with ``page_type`` set to ``"unknown"``,
        empty list fields, the first 500 characters of the candidate text in
        ``description`` and the reason in ``parse_error``.
    """
    cleaned = (text or "").strip()
    if cleaned.startswith("```"):
        cleaned = re.sub(r"^```[a-zA-Z]*\n?", "", cleaned)
        cleaned = re.sub(r"\n?```$", "", cleaned)
        cleaned = cleaned.strip()

    braces = re.search(r"\{[\s\S]*\}", cleaned)
    candidate = braces.group(0) if braces else cleaned

    parsed = None
    problem = None
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError as exc:
        problem = str(exc)

    if problem is not None and candidate.startswith("{"):
        # The greedy span above runs to the LAST "}", so prose after the
        # object can poison it. Decode just the leading object instead.
        # Only the first "{" is tried, so a nested fragment of a truncated
        # reply is never mistaken for the whole answer.
        try:
            parsed, _ = json.JSONDecoder().raw_decode(candidate)
        except json.JSONDecodeError:
            parsed = None
        else:
            problem = None

    if problem is None and not isinstance(parsed, dict):
        # Callers index the result by key; a list/number/null is unusable.
        problem = f"expected a JSON object, got {type(parsed).__name__}"

    if problem is None:
        return parsed

    print(f"[WARN] Не удалось распарсить JSON: {problem}")
    print(f"[WARN] Raw preview: {candidate[:500]}")
    return {
        "page_type": "unknown",
        "title": None,
        "elements": [],
        "beams": [],
        "positions": [],
        "dimensions": [],
        "gosts": [],
        "tables": [],
        "description": candidate[:500] if candidate else "",
        "parse_error": problem,
    }
def describe_page(image_path: Path, model: str) -> Dict:
    """Send one page image to the vision model and return its description.

    The image is downscaled to at most 2048 px on its longer side, sent as a
    base64 data URL together with ``EXTRACTION_PROMPT``, and the reply is
    parsed with ``parse_json_response``. The raw reply text is also written
    next to the image as ``<stem>_vlm_raw.txt`` for debugging.

    Args:
        image_path: Path of the page image.
        model: Model name passed to the chat-completions endpoint.

    Returns:
        The parsed description with an added ``_meta`` entry holding the
        image file name, its original pixel size and the applied scale.

    Raises:
        RuntimeError: If no API key is configured, or if the API reply
            carries no text content.
    """
    api_key = _load_api_key()
    if not api_key:
        raise RuntimeError("DASHSCOPE_API_KEY not found in .env or environment")

    client = OpenAI(api_key=api_key, base_url=BASE_URL)
    b64, scale, (orig_w, orig_h) = resize_image(image_path, max_size=2048)
    data_url = f"data:image/png;base64,{b64}"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": EXTRACTION_PROMPT},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ],
        temperature=0.1,  # low temperature — fewer hallucinations
        max_tokens=8192,
    )

    # A reply may come back without choices or with content set to None;
    # fail with an explicit message instead of an AttributeError/IndexError.
    choices = response.choices or []
    content = choices[0].message.content if choices else None
    if content is None:
        reason = getattr(choices[0], "finish_reason", None) if choices else None
        raise RuntimeError(f"VLM returned no text content (finish_reason={reason})")
    raw = content.strip()

    # Keep the raw reply for debugging.
    debug_path = image_path.parent / f"{image_path.stem}_vlm_raw.txt"
    debug_path.write_text(raw, encoding="utf-8")

    result = parse_json_response(raw)
    result["_meta"] = {
        "image": image_path.name,
        "original_size": [orig_w, orig_h],
        "scale": scale,
    }
    return result
def _count_elements(page: Dict) -> int:
    """Count the beams, positions and GOST references listed for one page."""
    total = 0
    for key in ("beams", "positions", "gosts"):
        items = page.get(key)
        # The model may answer null (or a bare string) instead of an array;
        # only real lists are counted so the tally can never raise.
        if isinstance(items, list):
            total += len(items)
    return total


def run_vlm_describer(folder: Path, model: str = DEFAULT_MODEL):
    """Describe every ``page_*.png`` in ``folder`` and save the results.

    Pages are processed in sorted file-name order. Each result is stored
    under the image file name; a page whose processing raises is recorded
    as a stub entry with the error text, and the run continues. The
    combined mapping is written to ``<folder>/vlm_extraction.json``.

    Args:
        folder: Directory containing the ``page_*.png`` images.
        model: Model name forwarded to ``describe_page``.

    Exits the process with status 1 when the folder has no matching images.
    """
    png_files = sorted(folder.glob("page_*.png"))
    if not png_files:
        print(f"[ERR] В папке {folder} не найдены page_*.png")
        sys.exit(1)

    out_path = folder / "vlm_extraction.json"
    extractions = {}

    print(f"[INFO] VLM Describer: {len(png_files)} страниц")
    print(f"[INFO] API: DashScope ({BASE_URL})")
    print(f"[INFO] Модель: {model}\n")

    for i, png in enumerate(png_files, 1):
        print(f"[{i}/{len(png_files)}] {png.name} ...", end=" ", flush=True)
        # Only the API call sits inside the try, so a problem while
        # reporting can no longer overwrite a successful extraction.
        try:
            data = describe_page(png, model)
        except Exception as e:
            print(f"ERR: {e}")
            extractions[png.name] = {
                "page_type": "unknown",
                "error": str(e),
                "elements": [],
                "beams": [],
                "positions": [],
                "dimensions": [],
                "gosts": [],
                "tables": [],
                "description": "",
            }
        else:
            extractions[png.name] = data
            print(f"OK ({_count_elements(data)} элементов)")

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(extractions, f, ensure_ascii=False, indent=2)

    total_elems = sum(_count_elements(page) for page in extractions.values())
    print(f"\n[OK] VLM extraction сохранён: {out_path}")
    print(f" Страниц: {len(png_files)}, Всего элементов: {total_elems}")
def main():
    """Command-line entry point: parse the arguments and run the describer."""
    from argparse import ArgumentParser

    cli = ArgumentParser(description="VLM Describer для чертежей")
    cli.add_argument("folder", help="Папка с page_*.png")
    cli.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help="Имя модели (default: qwen-vl-plus)",
    )
    options = cli.parse_args()
    run_vlm_describer(Path(options.folder), options.model)


if __name__ == "__main__":
    main()