opencode/tiling_ocr.py
Кирилл Блинов b5f7c6327e Add tiling OCR, preprocess and visualization tools
- tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition
- preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast
- visualize_dimensions.py: draw bounding boxes around detected dimension numbers
- compare_ocr.py: side-by-side visualization of normal vs tiling OCR results
- dimension_extractor.py: line-based dimension detection with pixel verification
- ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser
- test_qwen_ocr.py: standalone test for qwen OCR
- process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR
2026-06-01 12:29:26 +03:00

158 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Tiling OCR для больших чертежей.
Разрезает PNG на перекрывающиеся кропы, прогоняет OCR на каждом,
объединяет результаты с дедупликацией.
Эффект: каждый кроп масштабирован "крупнее" для OCR — мелкий текст
находится на бОльшем % площади кропа.
"""
import json
import re
import sys
import tempfile
from pathlib import Path
from typing import Dict, List, Tuple

from PIL import Image
from rapidocr_onnxruntime import RapidOCR
def make_tiles(img: "Image.Image", tile_size: int = 2000, overlap: int = 200) -> List[Tuple[int, int, "Image.Image"]]:
    """Split an image into overlapping tiles.

    Tiles are laid out on a grid with a stride of ``tile_size - overlap``
    pixels along each axis and are clipped at the right/bottom image edges,
    so edge tiles may be smaller than ``tile_size``. Along each axis,
    generation stops as soon as a tile reaches the image edge, so no tile is
    emitted that lies entirely inside its predecessor.

    Args:
        img: Source image; only ``img.size`` and ``img.crop(box)`` are used.
        tile_size: Maximum tile side length in pixels; must be positive.
        overlap: Number of pixels shared by neighbouring tiles; must satisfy
            ``0 <= overlap < tile_size``.

    Returns:
        A list of ``(offset_x, offset_y, cropped_image)`` tuples in row-major
        order, where the offsets are the tile's top-left corner in ``img``.
        Empty if the image has a zero dimension.

    Raises:
        ValueError: If ``tile_size`` or ``overlap`` is out of range.
    """
    if tile_size <= 0:
        raise ValueError(f"tile_size must be positive, got {tile_size}")
    if not 0 <= overlap < tile_size:
        # A stride <= 0 would loop forever or yield nothing; a negative
        # overlap would leave uncovered gaps between tiles.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < tile_size, "
            f"got overlap={overlap}, tile_size={tile_size}"
        )

    w, h = img.size
    if w <= 0 or h <= 0:
        return []

    step = tile_size - overlap

    def _origins(length):
        # Tile start offsets along one axis. Stop once the current tile
        # already reaches the edge: a further tile would only repeat pixels
        # that are fully covered by this one.
        origins = [0]
        while origins[-1] + tile_size < length:
            origins.append(origins[-1] + step)
        return origins

    xs = _origins(w)
    return [
        (x, y, img.crop((x, y, min(x + tile_size, w), min(y + tile_size, h))))
        for y in _origins(h)
        for x in xs
    ]
def iou_bbox(a: List, b: List) -> float:
    """Return the intersection-over-union (IoU) of two boxes.

    Each box is either a sequence of corner points, e.g.
    ``[[x1, y1], [x2, y2], [x3, y3], [x4, y4]]`` (points may be lists or
    tuples), or a flat ``[x1, y1, x2, y2]`` rectangle. Polygons are reduced
    to their axis-aligned bounding rectangle before comparison.

    Args:
        a: First box.
        b: Second box.

    Returns:
        The IoU of the two bounding rectangles; ``0.0`` when they do not
        overlap (including when they merely touch) or the union is empty.
    """
    def _get_rect(box):
        # Corner-point form: take the extent over all points.
        if isinstance(box[0], (list, tuple)):
            xs = [p[0] for p in box]
            ys = [p[1] for p in box]
            return min(xs), min(ys), max(xs), max(ys)
        # Flat form: normalise so swapped corners still give a valid rect.
        x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
        return min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)

    ax1, ay1, ax2, ay2 = _get_rect(a)
    bx1, by1, bx2, by2 = _get_rect(b)

    ix1 = max(ax1, bx1)
    iy1 = max(ay1, by1)
    ix2 = min(ax2, bx2)
    iy2 = min(ay2, by2)
    if ix2 <= ix1 or iy2 <= iy1:
        return 0.0

    inter = (ix2 - ix1) * (iy2 - iy1)
    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0
def run_tiling_ocr(png_path: Path, tile_size: int = 2000, overlap: int = 200, conf_threshold: float = 0.5) -> List[Dict]:
    """Run OCR over an image tile by tile and merge the detections.

    The image is split with ``make_tiles``, each tile is written to a
    private temporary directory and passed to a ``RapidOCR`` engine, and the
    detected boxes are shifted back into full-image coordinates. Detections
    from overlapping tiles are then de-duplicated by bounding-box IoU.

    Args:
        png_path: Path of the image to process.
        tile_size: Tile side length in pixels, forwarded to ``make_tiles``.
        overlap: Tile overlap in pixels, forwarded to ``make_tiles``.
        conf_threshold: Detections with a score below this value are dropped.

    Returns:
        A list of dicts with keys ``"text"``, ``"confidence"`` and ``"bbox"``
        (corner points in full-image coordinates), ordered by descending
        confidence.
    """
    print(f"[INFO] Загрузка {png_path.name}...")
    all_results = []
    # Both the image handle and the temporary tile files are released when
    # this block exits, even if OCR fails part-way through.
    with Image.open(png_path) as img, \
            tempfile.TemporaryDirectory(prefix="tiling_ocr_") as tmp_dir:
        print(f"[INFO] Размер: {img.size}")
        tiles = make_tiles(img, tile_size, overlap)
        print(f"[INFO] Кропов: {len(tiles)}")

        engine = RapidOCR()
        for i, (off_x, off_y, crop) in enumerate(tiles, 1):
            # The engine is fed a file path, so the tile is saved first.
            tile_path = Path(tmp_dir) / f"tile_{i:03d}.png"
            crop.save(tile_path)
            print(f" [{i}/{len(tiles)}] tile @ ({off_x}, {off_y}) size {crop.size} ...", end=" ", flush=True)
            tile_lines = _ocr_tile(engine, tile_path, off_x, off_y, conf_threshold)
            all_results.extend(tile_lines)
            print(f"{len(tile_lines)} lines")

    print(f"[INFO] Дедупликация {len(all_results)} строк...")
    unique = _dedupe_by_iou(all_results)
    print(f"[OK] Уникальных строк: {len(unique)}")
    return unique


def _ocr_tile(engine, tile_path, off_x, off_y, conf_threshold):
    """OCR one saved tile; return its detections in full-image coordinates."""
    # Assumes engine(path) returns a sequence whose first element is a list
    # of (box, text, score) items, or a falsy value when nothing was found.
    res = engine(str(tile_path))
    if not res or not res[0]:
        return []

    lines = []
    for box, txt, score in res[0]:
        if score < conf_threshold:
            continue
        lines.append({
            "text": txt,
            "confidence": float(score),
            # Shift by the tile offset; cast to plain float (like the score)
            # so the result stays JSON-serializable even if the engine
            # hands back NumPy scalars.
            "bbox": [[float(pt[0]) + off_x, float(pt[1]) + off_y] for pt in box],
        })
    return lines


def _dedupe_by_iou(results, iou_threshold=0.5):
    """Keep the highest-confidence detection among boxes with IoU above the threshold."""
    unique = []
    for cand in sorted(results, key=lambda r: -r["confidence"]):
        if not any(iou_bbox(cand["bbox"], kept["bbox"]) > iou_threshold for kept in unique):
            unique.append(cand)
    return unique
def main():
    """Command-line entry point.

    Usage: ``python tiling_ocr.py <png> [tile_size] [overlap]``.

    Runs ``run_tiling_ocr`` on the given image, writes all recognised lines
    to ``<stem>_tiling_ocr.json`` next to the input file, and prints the
    lines whose text is a plain number, ordered by the y coordinate of the
    first bbox corner. Exits with status 1 when no image path is given.
    """
    argv = sys.argv
    argc = len(argv)
    if argc < 2:
        print("Usage: python tiling_ocr.py <png> [tile_size] [overlap]")
        sys.exit(1)

    png_path = Path(argv[1])
    tile_size = 2000
    if argc > 2:
        tile_size = int(argv[2])
    overlap = 200
    if argc > 3:
        overlap = int(argv[3])

    results = run_tiling_ocr(png_path, tile_size, overlap)

    # Save the results
    out_json = png_path.parent / f"{png_path.stem}_tiling_ocr.json"
    with open(out_json, "w", encoding="utf-8") as f:
        report = {
            "source": str(png_path),
            "tile_size": tile_size,
            "overlap": overlap,
            "total_lines": len(results),
            "lines": results,
        }
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"[OK] Сохранено: {out_json}")

    # Print the numbers: integers or decimals with ',' or '.' as separator
    number_re = re.compile(r'^\d+([,.]\d+)?$')
    nums = []
    for line in results:
        if number_re.match(line["text"].strip()):
            nums.append(line)
    print(f"\nНайдено {len(nums)} чисел:")

    nums.sort(key=lambda line: line["bbox"][0][1])
    for line in nums:
        corners = line["bbox"]
        count = len(corners)
        # Report the centroid of the bbox corners.
        cx = sum(pt[0] for pt in corners) / count
        cy = sum(pt[1] for pt in corners) / count
        print(f" {line['text']:>10} x={cx:>8.0f} y={cy:>8.0f} conf={line['confidence']:.2f}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()