2026-05-29 06:54:37 +00:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
"""
|
2026-06-01 09:29:58 +00:00
|
|
|
|
VLM Describer — объективное извлечение структуры чертежа.
|
2026-05-29 06:54:37 +00:00
|
|
|
|
|
2026-06-01 09:29:58 +00:00
|
|
|
|
Отправляет PNG в qwen-vl-plus (DashScope API) с промптом на фактическое
|
|
|
|
|
|
описание содержимого. НЕ ищет ошибки, НЕ оценивает качество.
|
|
|
|
|
|
|
|
|
|
|
|
Результат: <output_folder>/vlm_extraction.json — структурированное описание
|
|
|
|
|
|
каждой страницы для использования в RAG и cross-verification.
|
2026-05-29 06:54:37 +00:00
|
|
|
|
|
|
|
|
|
|
Использование:
|
2026-06-01 09:29:58 +00:00
|
|
|
|
python vlm_describer.py <output_folder> [--model MODEL]
|
2026-05-29 06:54:37 +00:00
|
|
|
|
|
2026-06-01 09:29:58 +00:00
|
|
|
|
Требует DASHSCOPE_API_KEY в .env или окружении.
|
2026-05-29 06:54:37 +00:00
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
|
import sys
|
|
|
|
|
|
import json
|
|
|
|
|
|
import base64
|
2026-06-01 09:29:58 +00:00
|
|
|
|
import io
|
|
|
|
|
|
import re
|
2026-05-29 06:54:37 +00:00
|
|
|
|
from pathlib import Path
|
2026-06-01 09:29:58 +00:00
|
|
|
|
from typing import List, Dict, Tuple
|
|
|
|
|
|
from PIL import Image
|
2026-05-29 06:54:37 +00:00
|
|
|
|
from openai import OpenAI
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
# Cached API key; populated lazily by _load_api_key().
API_KEY = None
# Base URL handed to the OpenAI client (DashScope "compatible-mode" endpoint).
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
# Model used when the caller does not pass one explicitly.
DEFAULT_MODEL = "qwen-vl-plus"


def _load_api_key():
    """Return the DashScope API key, loading and caching it on first use.

    Lookup order:
      1. the cached module-level ``API_KEY``;
      2. a ``DASHSCOPE_API_KEY=...`` line in the ``.env`` file next to this
         module, then in the ``.env`` file one directory above it;
      3. the ``DASHSCOPE_API_KEY`` environment variable.

    Lines in ``.env`` may be indented, may carry an ``export `` prefix and may
    wrap the value in matching single or double quotes. Lines with an empty
    value are skipped so that they cannot shadow a real key found later.
    A key found in a ``.env`` file is also written to ``os.environ``.

    Returns:
        The key string, or ``None`` when no key could be found.
    """
    global API_KEY
    if API_KEY:
        return API_KEY

    prefix = "DASHSCOPE_API_KEY="
    module_dir = Path(__file__).parent
    for env_path in (module_dir / ".env", module_dir.parent / ".env"):
        # is_file() rather than exists(): a directory named ".env" must not
        # reach read_text().
        if not env_path.is_file():
            continue
        # utf-8-sig transparently drops a BOM, which would otherwise defeat
        # the startswith() check on the first line.
        for raw_line in env_path.read_text(encoding="utf-8-sig").splitlines():
            line = raw_line.strip()
            if line.startswith("export "):
                line = line[len("export "):].lstrip()
            if not line.startswith(prefix):
                continue
            value = line.split("=", 1)[1].strip()
            # Strip one pair of matching surrounding quotes.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1].strip()
            if not value:
                continue
            API_KEY = value
            os.environ["DASHSCOPE_API_KEY"] = API_KEY
            return API_KEY

    API_KEY = os.environ.get("DASHSCOPE_API_KEY")
    return API_KEY
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Prompt sent together with every page image. It is written in Russian and
# asks the model for a purely factual description of the page as a JSON object
# with the keys page_type, title, beams, positions, gosts and description,
# explicitly forbidding error-finding and quality judgements.
# NOTE(review): the rules section mentions dimensions ("размеры"), but the
# JSON template has no dimensions key — confirm whether that is intended.
EXTRACTION_PROMPT = (
    "Ты — система распознавания чертежей. Опиши объективно, что изображено на этой странице. "
    "НЕ ищи ошибки, НЕ оценивай качество. Просто перечисли факты.\n\n"
    "Ответь СТРОГО в формате JSON (без markdown):\n"
    "{\n"
    ' "page_type": "plan / section / elevation / specification / detail / general_view / table / unknown",\n'
    ' "title": "заголовок или null",\n'
    ' "beams": ["Балка Б-1"],\n'
    ' "positions": ["П-1"],\n'
    ' "gosts": ["ГОСТ ..."],\n'
    ' "description": "2-3 предложения о содержимом"\n'
    "}\n\n"
    "ПРАВИЛА:\n"
    "- Только реальные элементы с чертежа, не придумывай\n"
    "- Пустой массив [] если нет элементов данного типа\n"
    "- НЕ включай массы из таблиц в размеры\n"
    "- Описание — только факты, без оценок"
)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-06-01 09:29:58 +00:00
|
|
|
|
def resize_image(image_path: Path, max_size: int = 2048) -> Tuple[str, float, Tuple[int, int]]:
    """Return the image as base64, downscaled so its longer side fits ``max_size``.

    An image whose longer side is already within ``max_size`` is passed through
    as the raw file bytes (no re-encoding). A larger image is resized with
    LANCZOS resampling, keeping the aspect ratio, and re-encoded as PNG.

    Args:
        image_path: Path to the image file.
        max_size: Maximum allowed length of the longer side, in pixels.

    Returns:
        A tuple ``(b64, scale, (orig_w, orig_h))`` where ``b64`` is the
        base64-encoded image data, ``scale`` is the applied scale factor
        (``1.0`` when the image was not resized) and ``(orig_w, orig_h)`` is
        the original size in pixels.
    """
    # The context manager releases the file handle that Image.open() keeps.
    with Image.open(image_path) as img:
        orig_w, orig_h = img.size
        longest_side = max(orig_w, orig_h)

        if longest_side > max_size:
            scale = max_size / longest_side
            # Clamp to 1 px: for extreme aspect ratios int() can truncate the
            # short side to 0, which resize() rejects.
            new_w = max(1, int(orig_w * scale))
            new_h = max(1, int(orig_h * scale))
            img_resized = img.resize((new_w, new_h), Image.LANCZOS)

            buf = io.BytesIO()
            img_resized.save(buf, format="PNG")
            b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
            return b64, scale, (orig_w, orig_h)

    # Small enough already: send the file bytes untouched.
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    return b64, 1.0, (orig_w, orig_h)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_json_response(text: str) -> Dict:
    """Parse the JSON object contained in a VLM reply.

    The reply is stripped, an optional Markdown code fence is removed, and the
    text is narrowed to the span between the first ``{`` and the last ``}``.
    The first JSON object in that span is decoded; anything that follows it
    (for example a trailing remark that itself contains braces) is ignored.

    Args:
        text: Raw model reply. ``None`` is treated as an empty reply.

    Returns:
        The decoded JSON object. If the reply cannot be decoded, or decodes to
        something other than an object, a placeholder record is returned with
        ``page_type`` set to ``"unknown"``, empty lists, the first 500
        characters of the reply in ``description`` and the failure reason in
        ``parse_error``. A warning is printed in that case.
    """
    text = (text or "").strip()
    if text.startswith("```"):
        # Drop the opening ```lang fence and the closing ``` fence.
        text = re.sub(r"^```[a-zA-Z]*\n?", "", text)
        text = re.sub(r"\n?```$", "", text)
        text = text.strip()

    json_match = re.search(r'\{[\s\S]*\}', text)
    if json_match:
        text = json_match.group(0)

    try:
        if text.startswith("{"):
            # raw_decode() stops at the end of the first complete object, so
            # trailing text after it does not break parsing.
            parsed, _ = json.JSONDecoder().raw_decode(text)
        else:
            parsed = json.loads(text)
        if not isinstance(parsed, dict):
            # Callers treat the result as a dict (they add a "_meta" key).
            raise ValueError(
                f"expected a JSON object, got {type(parsed).__name__}"
            )
        return parsed
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(f"[WARN] Не удалось распарсить JSON: {e}")
        print(f"[WARN] Raw preview: {text[:500]}")
        return {
            "page_type": "unknown",
            "title": None,
            "elements": [],
            "beams": [],
            "positions": [],
            "dimensions": [],
            "gosts": [],
            "tables": [],
            "description": text[:500] if text else "",
            "parse_error": str(e)
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def describe_page(image_path: Path, model: str) -> Dict:
    """Send one page image to the vision model and return its parsed description.

    The image is base64-encoded (downscaled to at most 2048 px on the longer
    side), sent together with ``EXTRACTION_PROMPT`` through the OpenAI client
    pointed at ``BASE_URL``, and the reply is parsed with
    ``parse_json_response``. The raw reply is also written next to the image
    as ``<stem>_vlm_raw.txt`` for debugging.

    Args:
        image_path: Path to the page image.
        model: Model name passed to the chat-completions call.

    Returns:
        The parsed description with an added ``_meta`` entry holding the image
        file name, its original size and the applied scale factor.

    Raises:
        RuntimeError: If no API key is available.
    """
    api_key = _load_api_key()
    if not api_key:
        raise RuntimeError("DASHSCOPE_API_KEY not found in .env or environment")

    client = OpenAI(api_key=api_key, base_url=BASE_URL)

    b64, scale, (orig_w, orig_h) = resize_image(image_path, max_size=2048)
    data_url = f"data:image/png;base64,{b64}"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": EXTRACTION_PROMPT},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ],
        temperature=0.1,  # low temperature — fewer hallucinations
        max_tokens=8192,
    )

    # The reply may carry no choices or a null message content; treat both as
    # an empty reply so the parser produces its placeholder record instead of
    # this function crashing on .strip().
    choices = response.choices or []
    content = choices[0].message.content if choices else None
    raw = (content or "").strip()

    # Keep the raw reply for debugging.
    debug_path = image_path.parent / f"{image_path.stem}_vlm_raw.txt"
    debug_path.write_text(raw, encoding="utf-8")

    result = parse_json_response(raw)

    result["_meta"] = {
        "image": image_path.name,
        "original_size": [orig_w, orig_h],
        "scale": scale,
    }

    return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _count_elements(page: Dict) -> int:
    """Count the beams, positions and GOST references listed in a page record.

    Values that are not lists (for example ``null`` returned by the model) are
    counted as zero instead of raising.
    """
    return sum(
        len(page[key])
        for key in ("beams", "positions", "gosts")
        if isinstance(page.get(key), (list, tuple))
    )


def run_vlm_describer(folder: Path, model: str = DEFAULT_MODEL):
    """Describe every ``page_*.png`` in ``folder`` and save the results.

    Pages are processed in sorted file-name order. A page that fails is
    recorded with an ``error`` field and empty lists, and processing continues
    with the next page. All records are written to
    ``<folder>/vlm_extraction.json`` keyed by image file name.

    Args:
        folder: Directory containing the ``page_*.png`` files.
        model: Model name passed to ``describe_page``.

    Exits the process with status 1 when the folder has no ``page_*.png``.
    """
    png_files = sorted(folder.glob("page_*.png"))
    if not png_files:
        print(f"[ERR] В папке {folder} не найдены page_*.png")
        sys.exit(1)

    out_path = folder / "vlm_extraction.json"
    extractions = {}

    print(f"[INFO] VLM Describer: {len(png_files)} страниц")
    print(f"[INFO] API: DashScope ({BASE_URL})")
    print(f"[INFO] Модель: {model}\n")

    for i, png in enumerate(png_files, 1):
        print(f"[{i}/{len(png_files)}] {png.name} ...", end=" ", flush=True)
        try:
            data = describe_page(png, model)
        except Exception as e:
            # Per-page boundary: one failing page must not abort the batch.
            print(f"ERR: {e}")
            extractions[png.name] = {
                "page_type": "unknown",
                "error": str(e),
                "elements": [],
                "beams": [],
                "positions": [],
                "dimensions": [],
                "gosts": [],
                "tables": [],
                "description": ""
            }
        else:
            # Counting happens outside the try so that an odd value in the
            # reply can never replace a good record with an error record.
            extractions[png.name] = data
            elem_count = _count_elements(data)
            print(f"OK ({elem_count} элементов)")

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(extractions, f, ensure_ascii=False, indent=2)

    total_elems = sum(_count_elements(v) for v in extractions.values())
    print(f"\n[OK] VLM extraction сохранён: {out_path}")
    print(f" Страниц: {len(png_files)}, Всего элементов: {total_elems}")
|
2026-05-29 06:54:37 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: parse the arguments and run the describer."""
    import argparse

    cli = argparse.ArgumentParser(description="VLM Describer для чертежей")
    cli.add_argument("folder", help="Папка с page_*.png")
    cli.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help="Имя модели (default: qwen-vl-plus)",
    )
    options = cli.parse_args()

    run_vlm_describer(Path(options.folder), options.model)


if __name__ == "__main__":
    main()
|