- vlm_describer.py: objective extraction (beams, positions, GOSTs, dimensions) via qwen-vl-plus API. No error detection — only factual observation. - vlm_qc_checker.py: VLM-based QC (deprecated in favor of rules-only QC) - gost_dimension_validator.py: validate GOST references and dimension chains against known standards
229 lines
8.0 KiB
Python
229 lines
8.0 KiB
Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
VLM Describer — objective extraction of drawing structure.

Sends each PNG to qwen-vl-plus (DashScope API) with a prompt asking for a
factual description of the content. It does NOT look for errors and does NOT
assess quality.

Result: <output_folder>/vlm_extraction.json — a structured description of
every page, for use in RAG and cross-verification.

Usage:
    python vlm_describer.py <output_folder> [--model MODEL]

Requires DASHSCOPE_API_KEY in .env or in the environment.
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import base64
|
||
import io
|
||
import re
|
||
from pathlib import Path
|
||
from typing import List, Dict, Tuple
|
||
from PIL import Image
|
||
from openai import OpenAI
|
||
|
||
# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
# Cached DashScope API key; stays None until _load_api_key() resolves it.
API_KEY = None
# OpenAI-compatible DashScope endpoint, passed to the OpenAI client as base_url.
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
# Model name used when the caller does not supply one.
DEFAULT_MODEL = "qwen-vl-plus"
|
||
|
||
|
||
def _load_api_key():
    """Return the DashScope API key, resolving and caching it on first use.

    Lookup order:
      1. the module-level ``API_KEY`` cache, if it is already set;
      2. a ``DASHSCOPE_API_KEY=...`` line in ``.env`` next to this file,
         then in ``.env`` one directory above it;
      3. the ``DASHSCOPE_API_KEY`` environment variable.

    A key found in a ``.env`` file is also exported to ``os.environ``.
    Lines may be indented, prefixed with ``export`` and have the value
    wrapped in one pair of matching quotes. An empty assignment is ignored
    so that it cannot mask a key defined further down the lookup order.

    Returns:
        The key as a string, or ``None`` when no source provides one.
    """
    global API_KEY
    if API_KEY:
        return API_KEY

    prefix = "DASHSCOPE_API_KEY="
    here = Path(__file__).parent
    for env_path in (here / ".env", here.parent / ".env"):
        if not env_path.is_file():
            continue
        # Decode explicitly instead of relying on the locale encoding;
        # "utf-8-sig" also swallows a BOM that would otherwise hide line 1.
        for raw_line in env_path.read_text(encoding="utf-8-sig").splitlines():
            line = raw_line.strip()
            # Tolerate the shell-style "export KEY=value" form.
            if line.startswith("export "):
                line = line[len("export "):].lstrip()
            if not line.startswith(prefix):
                continue
            value = line[len(prefix):].strip()
            # Drop one pair of matching quotes: KEY="value" or KEY='value'.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1].strip()
            if not value:
                # "KEY=" with nothing after it is not a key; keep looking.
                continue
            API_KEY = value
            os.environ["DASHSCOPE_API_KEY"] = API_KEY
            return API_KEY

    API_KEY = os.environ.get("DASHSCOPE_API_KEY")
    return API_KEY
|
||
|
||
|
||
# Prompt sent together with every page image. It asks the model for a purely
# factual description, returned as a bare JSON object (no markdown fences)
# with the keys page_type, title, beams, positions, gosts and description.
# NOTE(review): the rules mention "размеры" (dimensions), but the JSON
# template has no "dimensions" key, while the fallback records built in
# parse_json_response() and run_vlm_describer() do carry one — confirm
# whether the key was dropped from the template on purpose.
EXTRACTION_PROMPT = (
    "Ты — система распознавания чертежей. Опиши объективно, что изображено на этой странице. "
    "НЕ ищи ошибки, НЕ оценивай качество. Просто перечисли факты.\n\n"
    "Ответь СТРОГО в формате JSON (без markdown):\n"
    "{\n"
    ' "page_type": "plan / section / elevation / specification / detail / general_view / table / unknown",\n'
    ' "title": "заголовок или null",\n'
    ' "beams": ["Балка Б-1"],\n'
    ' "positions": ["П-1"],\n'
    ' "gosts": ["ГОСТ ..."],\n'
    ' "description": "2-3 предложения о содержимом"\n'
    "}\n\n"
    "ПРАВИЛА:\n"
    "- Только реальные элементы с чертежа, не придумывай\n"
    "- Пустой массив [] если нет элементов данного типа\n"
    "- НЕ включай массы из таблиц в размеры\n"
    "- Описание — только факты, без оценок"
)
|
||
|
||
|
||
def resize_image(image_path: Path, max_size: int = 2048) -> Tuple[str, float, Tuple[int, int]]:
    """Return the image as base64 text, downscaled so its longer side fits ``max_size``.

    An image that already fits is sent as-is: the file bytes are encoded
    without re-compression. A larger image is resized proportionally with
    the LANCZOS filter and re-encoded as PNG.

    Args:
        image_path: Path to the image file.
        max_size: Maximum allowed length of the longer side, in pixels.

    Returns:
        ``(b64, scale, (orig_w, orig_h))``: the base64-encoded payload, the
        scale factor that was applied (``1.0`` when untouched) and the
        original size in pixels.

    Raises:
        ValueError: If ``max_size`` is not a positive number.
    """
    if max_size <= 0:
        raise ValueError(f"max_size must be positive, got {max_size}")

    # The context manager closes the file handle that Image.open() keeps
    # open; without it every processed page leaks a descriptor.
    with Image.open(image_path) as img:
        orig_w, orig_h = img.size
        longest = max(orig_w, orig_h)

        if longest <= max_size:
            # Small enough: ship the original bytes, no re-encoding.
            with open(image_path, "rb") as f:
                b64 = base64.b64encode(f.read()).decode("utf-8")
            return b64, 1.0, (orig_w, orig_h)

        scale = max_size / longest
        # Clamp to 1 px: on an extreme aspect ratio int() would round the
        # short side down to 0, which is not a valid target size.
        new_w = max(1, int(orig_w * scale))
        new_h = max(1, int(orig_h * scale))
        img_resized = img.resize((new_w, new_h), Image.LANCZOS)

    buf = io.BytesIO()
    img_resized.save(buf, format="PNG")
    b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

    return b64, scale, (orig_w, orig_h)
|
||
|
||
|
||
def parse_json_response(text: str) -> Dict:
    """Parse the JSON object contained in a VLM reply.

    A surrounding markdown code fence is removed first. The object is then
    decoded starting at the first ``{`` and anything after its closing
    brace is ignored, so trailing commentary from the model (even if it
    contains braces) does not break parsing.

    Args:
        text: Raw reply text; ``None`` is treated as an empty string.

    Returns:
        The decoded object. If the reply holds no decodable JSON object
        (including replies that are valid JSON but not an object, such as
        ``[]`` or ``null``), a placeholder record is returned instead, with
        ``page_type`` set to ``"unknown"``, empty lists, the first 500
        characters of the reply in ``description`` and the failure reason
        in ``parse_error``. The function therefore always returns a dict.
    """
    text = (text or "").strip()
    if text.startswith("```"):
        text = re.sub(r"^```[a-zA-Z]*\n?", "", text)
        text = re.sub(r"\n?```$", "", text)
        text = text.strip()

    try:
        start = text.find("{")
        if start == -1:
            raise ValueError("no JSON object found in the response")
        # raw_decode() stops at the end of the first complete value, unlike
        # json.loads(), which rejects any text that follows it.
        parsed, _end = json.JSONDecoder().raw_decode(text, start)
        return parsed
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(f"[WARN] Не удалось распарсить JSON: {e}")
        print(f"[WARN] Raw preview: {text[:500]}")
        return {
            "page_type": "unknown",
            "title": None,
            "elements": [],
            "beams": [],
            "positions": [],
            "dimensions": [],
            "gosts": [],
            "tables": [],
            "description": text[:500] if text else "",
            "parse_error": str(e),
        }
|
||
|
||
|
||
def describe_page(image_path: Path, model: str) -> Dict:
    """Send one page image to the vision model and return its structured description.

    The image is downscaled to at most 2048 px on the longer side, sent with
    ``EXTRACTION_PROMPT`` through the OpenAI-compatible endpoint at
    ``BASE_URL``, and the reply is parsed with ``parse_json_response``. The
    raw reply text is also written to ``<image stem>_vlm_raw.txt`` next to
    the image for debugging.

    Args:
        image_path: Path to the page image.
        model: Name of the model to query.

    Returns:
        The parsed description with an added ``_meta`` entry holding the
        image file name, its original size and the applied scale factor.

    Raises:
        RuntimeError: If no API key could be resolved.
    """
    api_key = _load_api_key()
    if not api_key:
        raise RuntimeError("DASHSCOPE_API_KEY not found in .env or environment")

    client = OpenAI(api_key=api_key, base_url=BASE_URL)

    b64, scale, (orig_w, orig_h) = resize_image(image_path, max_size=2048)
    data_url = f"data:image/png;base64,{b64}"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": EXTRACTION_PROMPT},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ],
        temperature=0.1,  # low temperature — fewer hallucinations
        max_tokens=8192,
    )

    # The reply may come back without choices or with content set to None;
    # treat both as an empty reply so the debug file and the parse-error
    # record are still produced instead of failing on .strip().
    choices = response.choices or []
    content = choices[0].message.content if choices else None
    raw = (content or "").strip()

    # Keep the raw reply for debugging
    debug_path = image_path.parent / f"{image_path.stem}_vlm_raw.txt"
    debug_path.write_text(raw, encoding="utf-8")

    result = parse_json_response(raw)

    result["_meta"] = {
        "image": image_path.name,
        "original_size": [orig_w, orig_h],
        "scale": scale,
    }

    return result
|
||
|
||
|
||
def run_vlm_describer(folder: Path, model: str = DEFAULT_MODEL):
    """Describe every ``page_*.png`` in ``folder`` and save the results as JSON.

    Pages are processed in sorted file-name order. A page whose processing
    raises is recorded with an ``error`` entry and empty lists, and the
    batch continues. All records are written to
    ``<folder>/vlm_extraction.json``, keyed by image file name.

    Args:
        folder: Directory containing the ``page_*.png`` files.
        model: Name of the model to query.

    Exits the process with status 1 when the folder holds no matching files.
    """
    png_files = sorted(folder.glob("page_*.png"))
    if not png_files:
        print(f"[ERR] В папке {folder} не найдены page_*.png")
        sys.exit(1)

    out_path = folder / "vlm_extraction.json"
    extractions = {}

    print(f"[INFO] VLM Describer: {len(png_files)} страниц")
    print(f"[INFO] API: DashScope ({BASE_URL})")
    print(f"[INFO] Модель: {model}\n")

    for i, png in enumerate(png_files, 1):
        print(f"[{i}/{len(png_files)}] {png.name} ...", end=" ", flush=True)
        try:
            data = describe_page(png, model)
        except Exception as e:
            # Best effort by design: one failing page must not stop the batch.
            print(f"ERR: {e}")
            extractions[png.name] = {
                "page_type": "unknown",
                "error": str(e),
                "elements": [],
                "beams": [],
                "positions": [],
                "dimensions": [],
                "gosts": [],
                "tables": [],
                "description": "",
            }
        else:
            # Reporting happens outside the try block so that a problem while
            # counting can never replace a successful extraction with an
            # error record.
            extractions[png.name] = data
            print(f"OK ({_count_elements(data)} элементов)")

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(extractions, f, ensure_ascii=False, indent=2)

    total_elems = sum(_count_elements(v) for v in extractions.values())
    print(f"\n[OK] VLM extraction сохранён: {out_path}")
    print(f" Страниц: {len(png_files)}, Всего элементов: {total_elems}")


def _count_elements(page: Dict) -> int:
    """Count the beams, positions and GOST references listed in one page record."""
    total = 0
    for key in ("beams", "positions", "gosts"):
        items = page.get(key)
        # The model may return null or a scalar where a list was requested;
        # only real lists contribute to the count.
        if isinstance(items, list):
            total += len(items)
    return total
|
||
|
||
|
||
def main():
    """Command-line entry point: read the folder and model options, then run the describer."""
    import argparse

    cli = argparse.ArgumentParser(description="VLM Describer для чертежей")
    cli.add_argument("folder", help="Папка с page_*.png")
    cli.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help="Имя модели (default: qwen-vl-plus)",
    )
    options = cli.parse_args()

    # The positional argument arrives as text; the describer expects a Path.
    run_vlm_describer(Path(options.folder), options.model)
|
||
|
||
|
||
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()
|