115 lines
4.0 KiB
Python
115 lines
4.0 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
Генерация текстовых описаний PNG-страниц через VLM в LM Studio.
|
|||
|
|
|
|||
|
|
Требования:
|
|||
|
|
- Запущен LM Studio с загруженной моделью (например, qwen3-vl-4b)
|
|||
|
|
- Сервер: http://127.0.0.1:1234/v1
|
|||
|
|
|
|||
|
|
Использование:
|
|||
|
|
python vlm_describer.py <output_folder> [--prompt "..."] [--model MODEL]
|
|||
|
|
|
|||
|
|
Результат: <output_folder>/vlm_descriptions.json
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import json
|
|||
|
|
import base64
|
|||
|
|
import argparse
|
|||
|
|
from pathlib import Path
|
|||
|
|
from openai import OpenAI
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
# LM Studio connection settings
# ------------------------------------------------------------------
# Both values can be overridden through environment variables; the
# fallbacks below are used when the variables are not set.
LMSTUDIO_URL = os.getenv("LMSTUDIO_URL", "http://127.0.0.1:1234/v1")
LMSTUDIO_KEY = os.getenv("LMSTUDIO_API_KEY", "lm-studio")
|
|||
|
|
|
|||
|
|
# Default instruction sent with every page image when --prompt is not given.
# The pieces are kept one per line so the bullet list stays easy to edit.
DEFAULT_PROMPT = "".join(
    [
        "Опиши этот чертеж подробно. Укажи:\n",
        "- Какой это этаж (если видно)\n",
        "- Какие оси обозначены\n",
        "- Какие размеры указаны\n",
        "- Какие помещения/квартиры видны\n",
        "- Общую компоновку и заметные детали.\n",
        "Отвечай на русском языке.",
    ]
)
|
|||
|
|
|
|||
|
|
# Shared OpenAI-compatible client pointed at the configured LM Studio server.
client = OpenAI(
    api_key=LMSTUDIO_KEY,
    base_url=LMSTUDIO_URL,
)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def encode_image(image_path: Path) -> str:
    """Read the file at *image_path* and return its bytes as base64 text."""
    with open(image_path, "rb") as stream:
        raw = stream.read()
    encoded = base64.b64encode(raw)
    return str(encoded, "utf-8")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def describe_image(
    image_path: Path,
    model: str,
    prompt: str,
    temperature: float = 0.3,
    max_tokens: int = 512,
) -> str:
    """Send one PNG to the VLM and return its text description.

    The image is embedded in the request as a base64 ``data:`` URL and sent
    together with *prompt* as a single user message through the module-level
    ``client``.

    Args:
        image_path: Path to the PNG file to describe.
        model: Model identifier passed to the chat-completions endpoint.
        prompt: Instruction text sent alongside the image.
        temperature: Sampling temperature for the request.
        max_tokens: Upper bound on generated tokens. The default is kept
            short on purpose: the 4B model degrades quickly on long outputs.

    Returns:
        The model's reply with surrounding whitespace stripped.

    Raises:
        RuntimeError: If the response contains no choices or the first
            choice carries no text content.
    """
    b64 = encode_image(image_path)
    data_url = f"data:image/png;base64,{b64}"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )

    # The API allows an empty choice list and a null message content; fail
    # with an explicit message instead of an opaque IndexError/AttributeError.
    if not response.choices:
        raise RuntimeError("Модель не вернула ни одного варианта ответа")
    content = response.choices[0].message.content
    if content is None:
        raise RuntimeError("Модель не вернула текст ответа")
    return content.strip()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _page_sort_key(png: Path) -> tuple:
    """Return a sort key that orders ``page_<N>.png`` files by numeric N.

    Files whose name part after ``page_`` is not purely decimal digits are
    placed after all numbered pages and ordered by file name.
    """
    suffix = png.stem[len("page_"):]
    if suffix.isdecimal():
        return (0, int(suffix), png.name)
    return (1, 0, png.name)


def process_folder(folder: Path, model: str, prompt: str):
    """Describe every ``page_*.png`` in *folder* and save the results as JSON.

    Pages are processed in numeric page order (``page_2.png`` before
    ``page_10.png``). Each description is stored under the file name as key.
    A failure on a single page does not stop the run: the error text is
    stored for that page with an ``[ERROR]`` prefix instead.

    Args:
        folder: Directory containing the ``page_*.png`` files; the output
            file ``vlm_descriptions.json`` is written into the same directory.
        model: Model identifier forwarded to ``describe_image``.
        prompt: Prompt text forwarded to ``describe_image``.

    Raises:
        SystemExit: With exit code 1 when no ``page_*.png`` files are found.
    """
    # Plain lexicographic sorting would put page_10 before page_2.
    png_files = sorted(folder.glob("page_*.png"), key=_page_sort_key)
    if not png_files:
        print(f"[ERR] В папке {folder} не найдены page_*.png")
        sys.exit(1)

    out_path = folder / "vlm_descriptions.json"
    descriptions = {}
    total = len(png_files)

    print(f"[INFO] Найдено {total} изображений")
    print(f"[INFO] LM Studio: {LMSTUDIO_URL}")
    print(f"[INFO] Модель: {model}\n")

    for i, png in enumerate(png_files, 1):
        print(f"[{i}/{total}] {png.name} ...", end=" ", flush=True)
        try:
            desc = describe_image(png, model, prompt)
        except Exception as e:
            # Deliberate best effort: record the failure and keep going so
            # one bad page does not discard the rest of the batch.
            print(f"ERR: {e}")
            descriptions[png.name] = f"[ERROR] {e}"
        else:
            descriptions[png.name] = desc
            print(f"OK ({len(desc)} chars)")

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(descriptions, f, ensure_ascii=False, indent=2)

    print(f"\n[OK] Сохранено: {out_path}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Parse the command-line arguments and run the folder processing."""
    cli = argparse.ArgumentParser(description="VLM-описания PNG через LM Studio")
    cli.add_argument("folder", help="Папка с page_*.png")
    cli.add_argument(
        "--model",
        default="qwen/qwen3-vl-4b",
        help="Имя модели в LM Studio (default: qwen/qwen3-vl-4b)",
    )
    cli.add_argument("--prompt", default=DEFAULT_PROMPT, help="Промпт для VLM")
    options = cli.parse_args()

    process_folder(Path(options.folder), options.model, options.prompt)


# Run the CLI only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|