234 lines
8.3 KiB
Python
234 lines
8.3 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
OCR via the Alibaba Cloud qwen-vl-ocr API.

Usage:
    from ocr_qwen import run_ocr
    results = run_ocr(image_path)

Requires DASHSCOPE_API_KEY, provided either in a .env file (next to this
module or up to two directories above it) or as an environment variable.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import json
|
|||
|
|
import base64
|
|||
|
|
import io
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import List, Dict, Tuple
|
|||
|
|
from PIL import Image
|
|||
|
|
from openai import OpenAI
|
|||
|
|
|
|||
|
|
# Connection settings for the DashScope OpenAI-compatible endpoint.
_API_KEY = None  # cached API key; filled in lazily by _load_key()
_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
_MODEL = "qwen-vl-ocr"


def _load_key():
    """Return the DashScope API key, caching it after the first lookup.

    Lookup order:
      1. the value cached in ``_API_KEY`` by a previous call;
      2. a ``DASHSCOPE_API_KEY=...`` entry in a ``.env`` file located next to
         this module or in one of its two parent directories;
      3. the ``DASHSCOPE_API_KEY`` environment variable.

    A key found in ``.env`` is also exported to ``os.environ``.

    ``.env`` entries may carry an ``export `` prefix, surrounding whitespace
    and single or double quotes around the value. Empty entries are ignored
    so that they cannot shadow a key supplied through the environment.

    Returns:
        The key string, or whatever ``os.environ.get`` yields (``None`` when
        the variable is not set) if no ``.env`` file provided a key.
    """
    global _API_KEY
    if _API_KEY:
        return _API_KEY

    module_dir = Path(__file__).parent
    env_candidates = [
        module_dir / ".env",
        module_dir.parent / ".env",
        module_dir.parent.parent / ".env",
    ]
    for env_path in env_candidates:
        if not env_path.is_file():
            continue
        try:
            # utf-8-sig drops a BOM that would otherwise hide the first entry;
            # errors="replace" keeps a stray non-UTF-8 comment from aborting
            # the lookup (the key itself is plain ASCII).
            content = env_path.read_text(encoding="utf-8-sig", errors="replace")
        except OSError:
            # Unreadable file: fall through to the next candidate.
            continue

        for line in content.splitlines():
            entry = line.strip()
            if entry.startswith("export "):
                entry = entry[len("export "):].lstrip()
            name, sep, value = entry.partition("=")
            if not sep or name.strip() != "DASHSCOPE_API_KEY":
                continue
            value = value.strip()
            # Strip one pair of matching quotes: KEY="sk-..." / KEY='sk-...'
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            if not value:
                continue
            _API_KEY = value
            os.environ["DASHSCOPE_API_KEY"] = _API_KEY
            return _API_KEY

    _API_KEY = os.environ.get("DASHSCOPE_API_KEY")
    return _API_KEY
|
|||
|
|
|
|||
|
|
|
|||
|
|
def resize_image(image_path: Path, max_size: int = 2048) -> Tuple[str, float, Tuple[int, int]]:
    """Base64-encode an image, downscaling it first when it is too large.

    If the longer side of the image is at most ``max_size`` pixels, the file
    is encoded byte-for-byte in its original format. Otherwise the image is
    shrunk proportionally so that its longer side equals ``max_size`` and is
    re-encoded as PNG.

    Args:
        image_path: Path to the image file.
        max_size: Maximum allowed length of the longer side, in pixels.

    Returns:
        ``(base64_string, scale_factor, (orig_w, orig_h))`` where
        ``scale_factor`` is ``1.0`` for an untouched image and
        ``max_size / longer_side`` for a downscaled one.

    Raises:
        ValueError: If ``max_size`` is smaller than 1.
    """
    if max_size < 1:
        raise ValueError(f"max_size must be >= 1, got {max_size}")

    # Modes the PNG writer accepts; anything else (e.g. CMYK) is converted
    # to RGB before saving, otherwise the save call would fail.
    png_modes = ("1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA")

    # The context manager guarantees the underlying file handle is released.
    with Image.open(image_path) as img:
        orig_w, orig_h = img.size
        longest = max(orig_w, orig_h)

        if longest > max_size:
            scale = max_size / longest
            # Clamp to 1 px so extreme aspect ratios never yield a 0-sized side.
            new_w = max(1, int(orig_w * scale))
            new_h = max(1, int(orig_h * scale))

            frame = img if img.mode in png_modes else img.convert("RGB")
            img_resized = frame.resize((new_w, new_h), Image.LANCZOS)

            buf = io.BytesIO()
            img_resized.save(buf, format="PNG")
            b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
            return b64, scale, (orig_w, orig_h)

    # Already small enough: send the file unchanged.
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    return b64, 1.0, (orig_w, orig_h)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def encode_image(image_path: Path) -> str:
    """Read the file at *image_path* and return its bytes as a Base64 string."""
    with open(image_path, mode="rb") as stream:
        payload = stream.read()
    encoded = base64.b64encode(payload)
    return str(encoded, "utf-8")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _unescape_json_string(raw: str) -> str:
    """Decode JSON escape sequences in *raw*; return *raw* itself if it is not valid JSON."""
    if "\\" not in raw:
        return raw
    try:
        # strict=False tolerates literal control characters inside the string.
        return json.loads(f'"{raw}"', strict=False)
    except ValueError:
        return raw


def _to_int(token: str) -> int:
    """Convert a numeric token matched by the object regex to an int, rounding floats."""
    return round(float(token)) if "." in token else int(token)


def parse_qwen_response(raw_text: str) -> List[Dict]:
    """Extract OCR items from the raw text returned by qwen-vl-ocr.

    The reply is expected to be a JSON array of
    ``{"text": "...", "rotate_rect": [n, n, n, n, n]}`` objects, optionally
    wrapped in a markdown code fence. Objects are extracted one by one with a
    regular expression, so complete objects are still recovered from a reply
    that is not valid JSON as a whole (for example one that was cut off).

    Compared with plain pattern matching on ``"[^"]*"``, the extraction:
      * accepts escaped quotes inside the text and decodes JSON escapes
        (``\\"``, ``\\n``, ``\\uXXXX``);
      * accepts decimal coordinates and rounds them to integers.

    If no object matches, the first ``[...]`` span is parsed with
    ``json.loads`` and its dict elements are returned as they are.

    Args:
        raw_text: Text content of the model reply.

    Returns:
        A list of dicts. Regex-extracted items have exactly the keys ``text``
        (str) and ``rotate_rect`` (list of five ints). An empty list is
        returned, after printing a warning, when nothing could be parsed.
    """
    import re

    text = (raw_text or "").strip()

    # Strip a markdown code fence: ```json ... ```
    if text.startswith("```"):
        lines = text.splitlines()
        if len(lines) == 1:
            # Whole reply on one line: just peel the backticks off.
            text = text.strip("`").strip()
        else:
            body = lines[1:]  # drop the opening fence line
            for idx, line in enumerate(body):
                if line.strip() == "```":
                    body = body[:idx]
                    break
            text = "\n".join(body).strip()

    # Shape: {"text": "...", "rotate_rect": [num, num, num, num, num]}
    # The text group first tries a JSON-style string (escapes allowed) and
    # falls back to "anything without a quote" for non-JSON backslashes.
    number = r'(-?\d+(?:\.\d+)?)'
    pattern = re.compile(
        r'\{\s*"text":\s*"((?:[^"\\]|\\.)*|[^"]*)"\s*,\s*"rotate_rect":\s*\[\s*'
        + r'\s*,\s*'.join([number] * 5)
        + r'\s*\]\s*\}'
    )

    results = [
        {
            "text": _unescape_json_string(match.group(1)),
            "rotate_rect": [_to_int(match.group(i)) for i in range(2, 7)],
        }
        for match in pattern.finditer(text)
    ]
    if results:
        return results

    # Fallback: regular JSON parsing of the outermost [...] span.
    json_match = re.search(r'\[[\s\S]*\]', text)
    if json_match:
        try:
            data = json.loads(json_match.group(0))
        except (ValueError, RecursionError):
            data = None
        if isinstance(data, list):
            # Callers use dict access on every item; drop anything else.
            return [entry for entry in data if isinstance(entry, dict)]

    print("[WARN] Regex parser не нашёл объекты, JSON тоже не распарсился")
    print(f"[WARN] Text preview: {text[:200]}")
    return results
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _rect_to_bbox(rect, scale: float):
    """Turn ``[x, y, w, h, ...]`` into four corner points in original-image pixels, or ``None``."""
    if not isinstance(rect, (list, tuple)) or len(rect) < 4:
        return None
    x, y, w, h = rect[:4]
    for value in (x, y, w, h):
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None
    # Coordinates refer to the (possibly downscaled) image that was sent;
    # map them back to the original resolution.
    if scale != 1.0:
        x = round(x / scale)
        y = round(y / scale)
        w = round(w / scale)
        h = round(h / scale)
    return [[x, y], [x + w, y], [x + w, y + h], [x, y + h]]


def run_ocr(image_path: Path, verbose: bool = False) -> List[Dict]:
    """Run qwen-vl-ocr on an image and return the recognised text items.

    The image is downscaled to at most 2048 px on its longer side, sent to
    the model together with a prompt requesting a JSON array of
    ``{"text", "rotate_rect"}`` objects, and the reply is parsed with
    ``parse_qwen_response``. The raw reply is also written next to the image
    as ``<stem>_qwen_raw.txt`` for debugging.

    Args:
        image_path: Path to the image file (a ``str`` is accepted as well).
        verbose: Print progress information to stdout.

    Returns:
        A list of dicts, one per recognised item::

            {
                "text": str,
                "bbox": [[x1, y1], [x2, y2], [x3, y3], [x4, y4]] or None,
                "confidence": 0.95,   # fixed value, the model reports none
                "source": "qwen-vl-ocr",
            }

        ``bbox`` is the axis-aligned rectangle built from the first four
        values of ``rotate_rect``, scaled back to original-image pixels. The
        fifth value (angle) is not used.

    Raises:
        RuntimeError: If no API key is available.
    """
    import mimetypes

    image_path = Path(image_path)

    api_key = _load_key()
    if not api_key:
        raise RuntimeError("DASHSCOPE_API_KEY not found in .env or environment")

    client = OpenAI(api_key=api_key, base_url=_BASE_URL)

    # Downscale the image to save tokens.
    b64, scale, (orig_w, orig_h) = resize_image(image_path, max_size=2048)

    # resize_image re-encodes to PNG only when it actually resizes; an
    # untouched image (scale == 1.0) is sent in its original file format, so
    # the data URL must advertise that format rather than PNG.
    mime = "image/png"
    if scale == 1.0:
        guessed = mimetypes.guess_type(image_path.name)[0]
        if guessed and guessed.startswith("image/"):
            mime = guessed
    data_url = f"data:{mime};base64,{b64}"

    if verbose:
        orig_size = image_path.stat().st_size / 1024
        print(f"[qwen-ocr] Отправка {image_path.name} (orig {orig_w}x{orig_h}, scale={scale:.2f}, {orig_size:.0f} KB)...", flush=True)

    response = client.chat.completions.create(
        model=_MODEL,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Распознай все текстовые элементы на этом чертеже. "
                            "Для каждого текста верни ОТДЕЛЬНЫЙ JSON-объект с полями: text, rotate_rect [x,y,w,h,angle]. "
                            "ВАЖНО: каждый текст — отдельный объект, без дублирующихся ключей в одном объекте. "
                            "Пример правильного формата:\n"
                            '[{"text": "Бетон", "rotate_rect": [100, 50, 30, 10, 0]}, {"text": "В30", "rotate_rect": [100, 65, 20, 10, 0]}]'
                            "\nОтветь строго в формате JSON-массива без markdown."
                        ),
                    },
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ],
        temperature=0.1,
        max_tokens=8192,
    )

    # The content field may be None (e.g. an empty or filtered reply).
    content = response.choices[0].message.content
    raw = (content or "").strip()

    # Keep the raw reply for debugging; a failed write must not discard the
    # OCR result that has already been obtained.
    debug_path = image_path.parent / f"{image_path.stem}_qwen_raw.txt"
    try:
        debug_path.write_text(raw, encoding="utf-8")
    except OSError as err:
        if verbose:
            print(f"[qwen-ocr] Could not write {debug_path}: {err}")

    items = parse_qwen_response(raw)

    # NOTE(review): (x, y) is treated as the top-left corner of the box, as
    # the prompt's [x,y,w,h,angle] suggests — confirm against real model
    # output, since the model may report the box centre instead.
    results = []
    for item in items:
        if not isinstance(item, dict):
            continue
        rect = item.get("rotate_rect", [0, 0, 0, 0, 0])
        results.append({
            "text": item.get("text", ""),
            "bbox": _rect_to_bbox(rect, scale),
            "confidence": 0.95,  # qwen-vl-ocr returns no confidence; use a high constant
            "source": "qwen-vl-ocr",
        })

    if verbose:
        print(f"[qwen-ocr] Найдено {len(results)} элементов")

    return results
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Command-line entry point: OCR a single image and print a short preview.
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python ocr_qwen.py <image.png>")
        sys.exit(1)

    image_path = Path(cli_args[0])
    results = run_ocr(image_path, verbose=True)
    total = len(results)

    print(f"\nНайдено {total} текстовых элементов:")

    # Only the first 20 items are listed; the rest are summarised below.
    for index, r in enumerate(results):
        if index >= 20:
            break
        print(f" '{r['text']}' bbox={r['bbox']}")

    remaining = total - 20
    if remaining > 0:
        print(f" ... и ещё {remaining}")
|