opencode/test_qwen_ocr.py
Кирилл Блинов b5f7c6327e Add tiling OCR, preprocess and visualization tools
- tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition
- preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast
- visualize_dimensions.py: draw bounding boxes around detected dimension numbers
- compare_ocr.py: side-by-side visualization of normal vs tiling OCR results
- dimension_extractor.py: line-based dimension detection with pixel verification
- ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser
- test_qwen_ocr.py: standalone test for qwen OCR
- process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR
2026-06-01 12:29:26 +03:00

140 lines
4.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Standalone test of the Alibaba Cloud DashScope qwen-vl-ocr model on a drawing.

Usage:
    python test_qwen_ocr.py <png_file> [--describe]

Without ``--describe`` the image is sent for text recognition; with it the
model is asked for a free-form description of the image instead.
"""
import os
import sys
import base64
import json
from pathlib import Path
from openai import OpenAI
def read_dotenv_value(path: Path, name: str) -> "str | None":
    """Return the value assigned to ``name`` in the dotenv-style file ``path``.

    Only the first ``name=value`` line is considered. Surrounding whitespace
    and one pair of matching single or double quotes are removed from the
    value. Returns ``None`` when the file does not exist, the key is absent,
    or the value is empty.
    """
    if not path.is_file():
        return None
    prefix = f"{name}="
    # utf-8-sig drops a leading BOM (common when the file was saved on
    # Windows) that would otherwise hide a key placed on the first line.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        entry = raw_line.strip()
        if not entry.startswith(prefix):
            continue
        value = entry[len(prefix):].strip()
        # KEY="value" / KEY='value' are valid dotenv spellings; the quotes
        # are not part of the secret.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        return value or None
    return None


# Load the key from the .env file located next to this script.
env_path = Path(__file__).parent / ".env"
API_KEY = read_dotenv_value(env_path, "DASHSCOPE_API_KEY")
if API_KEY:
    # Mirror the key into the process environment so that code reading the
    # variable directly sees the same value.
    os.environ["DASHSCOPE_API_KEY"] = API_KEY
else:
    # No usable value in .env: fall back to an already exported variable
    # instead of overwriting it with an empty string.
    API_KEY = os.environ.get("DASHSCOPE_API_KEY")

BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-vl-ocr"
def encode_image(image_path: Path) -> str:
    """Read ``image_path`` in binary mode and return its bytes as base64 text."""
    with open(image_path, mode="rb") as image_file:
        payload = image_file.read()
    encoded = base64.b64encode(payload)
    return encoded.decode("utf-8")
def test_ocr(image_path: Path) -> None:
    """Send an image to the OCR model and save the raw answer next to it.

    The image is embedded in the request as a base64 data URL together with a
    prompt asking for every text element, its bounding box and confidence as
    a JSON array. The first 2000 characters of the answer are printed, and the
    full answer is written to ``qwen_ocr_result_<stem>.json`` in the image's
    directory.

    Args:
        image_path: Path to the image file to recognise.
    """
    import mimetypes  # local import: only needed to label the data URL

    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    b64 = encode_image(image_path)

    # Label the payload with the real image type instead of always claiming
    # PNG; fall back to PNG when the extension is unknown or not an image.
    mime_type = mimetypes.guess_type(image_path.name)[0] or ""
    if not mime_type.startswith("image/"):
        mime_type = "image/png"
    data_url = f"data:{mime_type};base64,{b64}"

    print(f"Отправляем {image_path.name} в qwen-vl-ocr...")
    print(f"Размер файла: {image_path.stat().st_size / 1024 / 1024:.1f} MB")

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Распознай все текстовые элементы на этом чертеже. "
                            "Для каждого текста укажи:\n"
                            "- сам текст\n"
                            "- координаты bbox (x1,y1,x2,y2)\n"
                            "- confidence (если доступен)\n"
                            "Ответь в формате JSON-массива."
                        ),
                    },
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ],
        temperature=0.1,
        max_tokens=2048,
    )

    choice = response.choices[0]
    # The API may return no text content at all; treat that as an empty
    # answer rather than crashing on slicing / writing None.
    raw = choice.message.content or ""

    print("\n=== ОТВЕТ МОДЕЛИ ===")
    print(raw[:2000])
    print("=" * 50)
    if choice.finish_reason == "length":
        # The answer hit max_tokens, so the saved JSON is likely incomplete.
        print("[WARN] Ответ обрезан по лимиту max_tokens — результат может быть неполным")

    # Save the result next to the source image.
    out_path = image_path.parent / f"qwen_ocr_result_{image_path.stem}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(raw)
    print(f"\n[OK] Сохранено: {out_path}")
def describe_image(image_path: Path):
    """Ask the model for a free-form description of what it sees in the image.

    The image is embedded in the request as a base64 data URL. The model's
    answer is printed and returned.

    Args:
        image_path: Path to the image file to describe.

    Returns:
        The message content of the first choice in the model response.
    """
    import mimetypes  # local import: only needed to label the data URL

    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    b64 = encode_image(image_path)

    # Label the payload with the real image type instead of always claiming
    # PNG; fall back to PNG when the extension is unknown or not an image.
    mime_type = mimetypes.guess_type(image_path.name)[0] or ""
    if not mime_type.startswith("image/"):
        mime_type = "image/png"
    data_url = f"data:{mime_type};base64,{b64}"

    print(f"\nОтправляем {image_path.name} на описание...")

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Опиши подробно, что ты видишь на этом изображении. "
                            "Чертеж здания или что-то другое? Какие элементы видны? "
                            "Размеры, текст, линии, оси — всё, что различимо."
                        ),
                    },
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ],
        temperature=0.3,
        max_tokens=1024,
    )

    desc = response.choices[0].message.content
    print("\n=== ОПИСАНИЕ ===")
    print(desc)
    print("=" * 50)
    return desc
def main():
    """Command-line entry point.

    Expects the image path as a positional argument and an optional
    ``--describe`` flag, which may appear before or after the path. With the
    flag the image is sent for description, otherwise for OCR. Exits with
    status 1 when the path is missing, the file does not exist, or no API key
    is configured.
    """
    # Separate the flag from positional arguments so that
    # "--describe <file>" works as well as "<file> --describe".
    cli_args = sys.argv[1:]
    positional = [arg for arg in cli_args if arg != "--describe"]
    if not positional:
        print("Usage: python test_qwen_ocr.py <png_file> [--describe]")
        sys.exit(1)

    image_path = Path(positional[0])
    if not image_path.exists():
        print(f"[ERR] Файл не найден: {image_path}")
        sys.exit(1)

    # Fail early with a clear message instead of an error raised from inside
    # the API client when no key is configured.
    if not API_KEY:
        print("[ERR] Не задан DASHSCOPE_API_KEY (в .env рядом со скриптом или в переменной окружения)")
        sys.exit(1)

    if "--describe" in cli_args:
        describe_image(image_path)
    else:
        test_ocr(image_path)


if __name__ == "__main__":
    main()