- tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR
234 lines
8.3 KiB
Python
234 lines
8.3 KiB
Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
OCR via the Alibaba Cloud qwen-vl-ocr API.

Usage:
    from ocr_qwen import run_ocr
    results = run_ocr(image_path)

Requires DASHSCOPE_API_KEY in .env
"""

import os
import json
import base64
import io
from pathlib import Path
from typing import List, Dict, Tuple
from PIL import Image
from openai import OpenAI

# Cached API key; starts empty and is filled in on demand by _load_key().
_API_KEY = None
# Endpoint handed to the OpenAI client as ``base_url`` in run_ocr().
_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
# Model name sent with every chat-completion request in run_ocr().
_MODEL = "qwen-vl-ocr"
|
||
|
||
def _load_key():
    """Return the DashScope API key, caching it in the module-level ``_API_KEY``.

    Lookup order:
      1. the cached ``_API_KEY`` when it is already set;
      2. a ``DASHSCOPE_API_KEY=...`` line in a ``.env`` file located next to
         this file, in its parent directory, or in its grandparent directory
         (the first non-empty value wins);
      3. the ``DASHSCOPE_API_KEY`` environment variable.

    A key found in ``.env`` is also exported to ``os.environ``.

    Returns:
        The key string, or ``None`` when no source provides one.
    """
    global _API_KEY
    if _API_KEY:
        return _API_KEY

    prefix = "DASHSCOPE_API_KEY="
    here = Path(__file__).parent
    candidates = (
        here / ".env",
        here.parent / ".env",
        here.parent.parent / ".env",
    )
    for env_path in candidates:
        # is_file() rather than exists(): a directory named ".env" must not
        # make the lookup crash.
        if not env_path.is_file():
            continue
        try:
            # utf-8-sig transparently drops a BOM that would otherwise hide
            # a key placed on the first line.
            content = env_path.read_text(encoding="utf-8-sig")
        except (OSError, UnicodeDecodeError):
            # Unreadable candidate: keep looking in the remaining locations.
            continue

        for line in content.splitlines():
            entry = line.strip()
            if entry.startswith("export "):
                entry = entry[len("export "):].lstrip()
            if not entry.startswith(prefix):
                continue

            value = entry[len(prefix):].strip()
            # Accept KEY="value" / KEY='value' by removing one matching pair
            # of surrounding quotes.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            if not value:
                # An empty assignment must not overwrite a real key that is
                # already present in the process environment.
                continue

            _API_KEY = value
            os.environ["DASHSCOPE_API_KEY"] = value
            return _API_KEY

    _API_KEY = os.environ.get("DASHSCOPE_API_KEY")
    return _API_KEY
|
||
|
||
|
||
def resize_image(image_path: Path, max_size: int = 2048) -> Tuple[str, float, Tuple[int, int]]:
    """Base64-encode an image, downscaling it when its longest side exceeds ``max_size``.

    Args:
        image_path: Path of the image file to read.
        max_size: Maximum allowed length, in pixels, of the longest side.

    Returns:
        ``(base64_string, scale_factor, (orig_w, orig_h))``.

        * If the image already fits, the file's bytes are encoded unchanged
          (whatever the file format is) and ``scale_factor`` is ``1.0``.
        * Otherwise the image is resized with LANCZOS resampling, re-encoded
          as PNG, and ``scale_factor`` is ``max_size / longest_side``.
    """
    # The context manager guarantees the underlying file handle is released.
    with Image.open(image_path) as img:
        orig_w, orig_h = img.size
        longest = max(orig_w, orig_h)

        if longest > max_size:
            scale = max_size / longest
            # Clamp to 1 px: for extreme aspect ratios int() can truncate the
            # short side to 0, which resize() rejects.
            new_size = (max(1, int(orig_w * scale)), max(1, int(orig_h * scale)))
            resized = img.resize(new_size, Image.LANCZOS)

            # Modes outside this set (e.g. CMYK) cannot be written as PNG, so
            # convert them to RGB first.
            if resized.mode not in ("1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"):
                resized = resized.convert("RGB")

            buf = io.BytesIO()
            resized.save(buf, format="PNG")
            b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
            return b64, scale, (orig_w, orig_h)

    # Already small enough: send the original bytes untouched.
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    return b64, 1.0, (orig_w, orig_h)
|
||
|
||
|
||
def encode_image(image_path: Path) -> str:
    """Return the contents of the file at ``image_path`` as a base64 string."""
    with open(image_path, "rb") as stream:
        payload = stream.read()
    encoded = base64.b64encode(payload)
    return encoded.decode("utf-8")
|
||
|
||
|
||
def parse_qwen_response(raw_text: str) -> List[Dict]:
    """Extract OCR items from the raw text returned by qwen-vl-ocr.

    The answer is expected to contain objects shaped like
    ``{"text": "...", "rotate_rect": [x, y, w, h, angle]}``.  Each object is
    pulled out individually with a regular expression, so one malformed
    object does not invalidate the rest of the answer.  If no object matches,
    the first ``[...]`` span is parsed as ordinary JSON instead.

    Args:
        raw_text: Model output, optionally wrapped in a Markdown code fence.

    Returns:
        A list of dicts.  Items found by the regex have exactly the keys
        ``"text"`` (JSON escapes decoded) and ``"rotate_rect"`` (five ints;
        fractional values are rounded).  Items from the JSON fallback are
        returned as parsed, restricted to dict entries.  An empty list is
        returned, after printing a warning, when nothing could be parsed.
    """
    import re

    text = raw_text.strip()

    # Remove a Markdown code fence (```json ... ```).
    if text.startswith("```"):
        body = text.splitlines()[1:]
        for idx, line in enumerate(body):
            if line.strip() == "```":
                body = body[:idx]
                break
        unfenced = "\n".join(body).strip()
        # A fence written on a single line ("```[...]```") would leave nothing
        # behind; in that case keep the text as is, since both parsers below
        # locate their targets anywhere in the string.
        if unfenced:
            text = unfenced

    def to_int(token):
        # Coordinates are normally integers; tolerate "12.5" by rounding.
        try:
            return int(token)
        except ValueError:
            return round(float(token))

    def decode(fragment):
        # Resolve JSON escapes (\" \\ \n \uXXXX); keep the raw text if the
        # fragment is not a valid JSON string body.
        if "\\" not in fragment:
            return fragment
        try:
            return json.loads('"' + fragment + '"')
        except ValueError:
            return fragment

    # Robust parsing: extract every object separately.
    # Shape: {"text": "...", "rotate_rect": [num, num, num, num, num]}
    # The text group accepts backslash escapes so that an escaped quote does
    # not cause the whole object to be skipped.
    number = r'(-?\d+(?:\.\d+)?)'
    pattern = re.compile(
        r'\{\s*"text":\s*"((?:[^"\\]|\\.)*)"\s*,\s*"rotate_rect":\s*\[\s*'
        + r'\s*,\s*'.join([number] * 5)
        + r'\s*\]\s*\}',
        re.DOTALL,
    )

    results = [
        {
            "text": decode(match.group(1)),
            "rotate_rect": [to_int(match.group(i)) for i in range(2, 7)],
        }
        for match in pattern.finditer(text)
    ]
    if results:
        return results

    # Fallback: standard JSON parsing of the first [...] span.
    json_match = re.search(r'\[[\s\S]*\]', text)
    if json_match:
        try:
            data = json.loads(json_match.group(0))
        except ValueError:
            data = None
        if isinstance(data, list):
            # Callers read items with .get(); drop anything that is not a dict.
            return [entry for entry in data if isinstance(entry, dict)]

    print("[WARN] Regex parser не нашёл объекты, JSON тоже не распарсился")
    print(f"[WARN] Text preview: {text[:200]}")
    return results
|
||
|
||
|
||
def _rotate_rect_to_bbox(rect, scale):
    """Turn ``[x, y, w, h, ...]`` into four corner points, undoing ``scale``.

    Returns ``None`` when ``rect`` is not a sequence of at least four numbers.
    """
    if not isinstance(rect, (list, tuple)) or len(rect) < 4:
        return None
    x, y, w, h = rect[0], rect[1], rect[2], rect[3]
    for value in (x, y, w, h):
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None

    # Map coordinates measured on the downscaled image back to the original.
    if scale != 1.0:
        x = round(x / scale)
        y = round(y / scale)
        w = round(w / scale)
        h = round(h / scale)

    # NOTE(review): the angle (rect[4]) is ignored and (x, y) is treated as the
    # top-left corner of an axis-aligned box — confirm against the API's
    # definition of rotate_rect.
    return [[x, y], [x + w, y], [x + w, y + h], [x, y + h]]


def run_ocr(image_path: Path, verbose: bool = False) -> List[Dict]:
    """Run qwen-vl-ocr on an image and return the recognized text items.

    The image is downscaled to at most 2048 px on its longest side before
    upload, and returned coordinates are scaled back to the original size.
    The raw model answer is also written next to the image as
    ``<stem>_qwen_raw.txt`` for debugging (best effort).

    Args:
        image_path: Image file to recognize (``Path`` or path string).
        verbose: Print progress information when true.

    Returns:
        A list of dicts with the keys:
            "text": recognized string;
            "bbox": four corner points
                ``[[x, y], [x + w, y], [x + w, y + h], [x, y + h]]`` in
                original-image pixels, or ``None`` if the item carried no
                usable rectangle;
            "confidence": always ``0.95`` (the model reports none);
            "source": always ``"qwen-vl-ocr"``.

    Raises:
        RuntimeError: If no API key is available.
    """
    import mimetypes

    api_key = _load_key()
    if not api_key:
        raise RuntimeError("DASHSCOPE_API_KEY not found in .env or environment")

    # Accept plain strings too; the code below relies on Path attributes.
    image_path = Path(image_path)

    client = OpenAI(api_key=api_key, base_url=_BASE_URL)

    # Downscale the image to save tokens.
    b64, scale, (orig_w, orig_h) = resize_image(image_path, max_size=2048)

    # resize_image() re-encodes to PNG only when it actually resizes; with
    # scale == 1.0 the original bytes are sent, so the declared media type has
    # to follow the file itself (e.g. image/jpeg).
    mime = "image/png"
    if scale == 1.0:
        guessed = mimetypes.guess_type(image_path.name)[0]
        if guessed and guessed.startswith("image/"):
            mime = guessed
    data_url = f"data:{mime};base64,{b64}"

    if verbose:
        orig_size = image_path.stat().st_size / 1024
        print(f"[qwen-ocr] Отправка {image_path.name} (orig {orig_w}x{orig_h}, scale={scale:.2f}, {orig_size:.0f} KB)...", flush=True)

    response = client.chat.completions.create(
        model=_MODEL,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Распознай все текстовые элементы на этом чертеже. "
                            "Для каждого текста верни ОТДЕЛЬНЫЙ JSON-объект с полями: text, rotate_rect [x,y,w,h,angle]. "
                            "ВАЖНО: каждый текст — отдельный объект, без дублирующихся ключей в одном объекте. "
                            "Пример правильного формата:\n"
                            '[{"text": "Бетон", "rotate_rect": [100, 50, 30, 10, 0]}, {"text": "В30", "rotate_rect": [100, 65, 20, 10, 0]}]'
                            "\nОтветь строго в формате JSON-массива без markdown."
                        ),
                    },
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ],
        temperature=0.1,
        max_tokens=8192,
    )

    # The content may be None (e.g. an empty completion); treat it as "".
    raw = (response.choices[0].message.content or "").strip()

    # Save the raw answer for debugging. A read-only directory must not throw
    # away an OCR result that has already been obtained.
    debug_path = image_path.parent / f"{image_path.stem}_qwen_raw.txt"
    try:
        debug_path.write_text(raw, encoding="utf-8")
    except OSError as exc:
        if verbose:
            print(f"[qwen-ocr] could not write {debug_path}: {exc}")

    items = parse_qwen_response(raw)

    # Convert rotate_rect to the output format, scaling back to the original.
    results = []
    for item in items:
        if not isinstance(item, dict):
            continue
        rect = item.get("rotate_rect", [0, 0, 0, 0, 0])
        results.append({
            "text": item.get("text", ""),
            "bbox": _rotate_rect_to_bbox(rect, scale),
            "confidence": 0.95,  # qwen-vl-ocr returns no confidence; use a high constant
            "source": "qwen-vl-ocr"
        })

    if verbose:
        print(f"[qwen-ocr] Найдено {len(results)} элементов")

    return results
|
||
|
||
|
||
if __name__ == "__main__":
    import sys

    # At least one positional argument is required: the image to recognize.
    if len(sys.argv) < 2:
        print("Usage: python ocr_qwen.py <image.png>")
        sys.exit(1)

    image_path = Path(sys.argv[1])
    results = run_ocr(image_path, verbose=True)

    # Show only the first few items and summarize the remainder.
    preview_limit = 20
    print(f"\nНайдено {len(results)} текстовых элементов:")
    for entry in results[:preview_limit]:
        print(f" '{entry['text']}' bbox={entry['bbox']}")

    hidden = len(results) - preview_limit
    if hidden > 0:
        print(f" ... и ещё {hidden}")
|