- tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR
140 lines
4.4 KiB
Python
140 lines
4.4 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Test Alibaba Cloud DashScope qwen-vl-ocr on a drawing.

Usage:
    python test_qwen_ocr.py <png_file> [--describe]
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import base64
|
||
import json
|
||
from pathlib import Path
|
||
from openai import OpenAI
|
||
|
||
def load_env_value(env_file: Path, name: str):
    """Return the value assigned to ``name`` in a dotenv-style file.

    Only plain ``NAME=value`` lines are understood. The first matching line
    wins; surrounding whitespace and one layer of single/double quotes are
    removed from the value.

    Args:
        env_file: Path to the ``.env`` file; it does not have to exist.
        name: Variable name to look up.

    Returns:
        The value as a string, or ``None`` when the file is missing, the
        variable is not present, or its value is empty.
    """
    if not env_file.is_file():
        return None

    prefix = f"{name}="
    # utf-8-sig transparently drops a BOM, which would otherwise hide a key
    # that sits on the very first line of the file.
    for line in env_file.read_text(encoding="utf-8-sig").splitlines():
        line = line.strip()
        if line.startswith(prefix):
            value = line[len(prefix):].strip().strip("\"'")
            return value or None
    return None


# Load the key from the .env file that sits next to this script.
env_path = Path(__file__).parent / ".env"
API_KEY = load_env_value(env_path, "DASHSCOPE_API_KEY")

if API_KEY:
    # Export the key so code that reads the environment sees it as well.
    os.environ["DASHSCOPE_API_KEY"] = API_KEY
else:
    # No usable key in .env: fall back to the process environment without
    # overwriting whatever is already set there.
    API_KEY = os.environ.get("DASHSCOPE_API_KEY")

# Endpoint and model name passed to the OpenAI client below.
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-vl-ocr"
|
||
|
||
|
||
def encode_image(image_path: Path) -> str:
    """Read a file in binary mode and return its contents as base64 text."""
    with open(image_path, mode="rb") as image_file:
        payload = image_file.read()
    encoded = base64.b64encode(payload)
    return encoded.decode("utf-8")
|
||
|
||
|
||
def test_ocr(image_path: Path):
    """Send a drawing to the OCR model and save the raw reply next to it.

    The image is embedded in the request as a base64 ``data:`` URL together
    with a prompt asking for every text element, its bounding box and its
    confidence as a JSON array. At most the first 2000 characters of the
    reply are printed; the complete reply is written, unparsed, to
    ``qwen_ocr_result_<image stem>.json`` in the image's directory.

    Args:
        image_path: Path to the image file to recognise.

    Returns:
        The raw text of the model's reply (an empty string when the reply
        carried no text content).
    """
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

    b64 = encode_image(image_path)
    data_url = f"data:image/png;base64,{b64}"

    print(f"Отправляем {image_path.name} в qwen-vl-ocr...")
    print(f"Размер файла: {image_path.stat().st_size / 1024 / 1024:.1f} MB")

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Распознай все текстовые элементы на этом чертеже. "
                            "Для каждого текста укажи:\n"
                            "- сам текст\n"
                            "- координаты bbox (x1,y1,x2,y2)\n"
                            "- confidence (если доступен)\n"
                            "Ответь в формате JSON-массива."
                        ),
                    },
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ],
        temperature=0.1,
        max_tokens=2048,
    )

    # The SDK types message.content as optional; normalise None to "" so the
    # slicing and file write below cannot fail with a TypeError.
    raw = response.choices[0].message.content or ""
    print("\n=== ОТВЕТ МОДЕЛИ ===")
    print(raw[:2000])
    print("=" * 50)

    # Save the result. The reply is stored verbatim, so the file is valid
    # JSON only if the model followed the requested format.
    out_path = image_path.parent / f"qwen_ocr_result_{image_path.stem}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(raw)
    print(f"\n[OK] Сохранено: {out_path}")
    return raw
|
||
|
||
|
||
def describe_image(image_path: Path):
    """Ask the model for a free-form description of the drawing.

    The image is sent as a base64 ``data:`` URL alongside a prompt asking
    what is visible on it. The reply is printed and returned as-is.
    """
    api = OpenAI(api_key=API_KEY, base_url=BASE_URL)

    image_b64 = encode_image(image_path)
    prompt = (
        "Опиши подробно, что ты видишь на этом изображении. "
        "Чертеж здания или что-то другое? Какие элементы видны? "
        "Размеры, текст, линии, оси — всё, что различимо."
    )
    # One user message with two parts: the instruction and the image itself.
    user_content = [
        {"type": "text", "text": prompt},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{image_b64}"},
        },
    ]

    print(f"\nОтправляем {image_path.name} на описание...")

    completion = api.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": user_content}],
        temperature=0.3,
        max_tokens=1024,
    )

    description = completion.choices[0].message.content
    print("\n=== ОПИСАНИЕ ===")
    print(description)
    print("=" * 50)
    return description
|
||
|
||
|
||
def main():
    """Command-line entry point.

    Usage: ``python test_qwen_ocr.py <png_file> [--describe]``

    Without ``--describe`` the image is run through ``test_ocr``; with it,
    through ``describe_image``. Prints a message and exits with status 1
    when no image path is given or the path is not an existing file.
    """
    args = sys.argv[1:]
    describe = "--describe" in args
    # Take the flag out of the argument list so it can appear before or after
    # the image path without being mistaken for the path itself.
    positional = [arg for arg in args if arg != "--describe"]

    if not positional:
        print("Usage: python test_qwen_ocr.py <png_file> [--describe]")
        sys.exit(1)

    image_path = Path(positional[0])
    # is_file() also rejects directories, which would otherwise only fail
    # later when the path is opened for reading.
    if not image_path.is_file():
        print(f"[ERR] Файл не найден: {image_path}")
        sys.exit(1)

    if describe:
        describe_image(image_path)
    else:
        test_ocr(image_path)
|
||
|
||
|
||
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|