opencode/tiling_ocr.py

158 lines
5.2 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Tiling OCR for large drawings.

Splits a PNG into overlapping crops, runs OCR on each crop, and merges
the results with de-duplication.
Effect: each crop is presented to the OCR engine at a "larger" relative
scale, so small text occupies a bigger share of the crop area.
"""
import sys
import json
import re
from pathlib import Path
from typing import List, Dict, Tuple
from PIL import Image
from rapidocr import RapidOCR
def _tile_origins(length, tile_size, step):
    """Return the tile start offsets along one axis of size ``length``."""
    origins = []
    for start in range(0, length, step):
        origins.append(start)
        # Once a tile reaches the far edge, any further tile would lie
        # entirely inside it, so it would only add redundant OCR work.
        if start + tile_size >= length:
            break
    return origins


def make_tiles(img: Image.Image, tile_size: int = 2000, overlap: int = 200) -> List[Tuple[int, int, Image.Image]]:
    """Split an image into overlapping crops.

    Tiles are laid out on a grid with stride ``tile_size - overlap``.
    Tiles touching the right or bottom edge are clipped to the image
    bounds and may therefore be smaller than ``tile_size``. Along each
    axis the grid stops at the first tile that reaches the edge.

    Args:
        img: Source image; only ``img.size`` and ``img.crop`` are used.
        tile_size: Maximum width and height of a tile, in pixels.
        overlap: Number of pixels shared by neighbouring tiles.

    Returns:
        A list of ``(offset_x, offset_y, cropped_image)`` tuples in
        row-major order (top to bottom, left to right).

    Raises:
        ValueError: If ``tile_size`` is not positive or ``overlap`` is not
            in the range ``[0, tile_size)``.
    """
    if tile_size <= 0:
        raise ValueError(f"tile_size must be positive, got {tile_size}")
    if not 0 <= overlap < tile_size:
        # A non-positive stride would either crash range() or silently
        # produce no tiles; a negative overlap would leave gaps.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < tile_size, got {overlap}"
        )

    w, h = img.size
    step = tile_size - overlap
    xs = _tile_origins(w, tile_size, step)
    ys = _tile_origins(h, tile_size, step)

    tiles = []
    for y in ys:
        y2 = min(y + tile_size, h)
        for x in xs:
            x2 = min(x + tile_size, w)
            tiles.append((x, y, img.crop((x, y, x2, y2))))
    return tiles
def _bbox_bounds(box):
    """Return ``(x_min, y_min, x_max, y_max)`` for a polygon or a flat box."""
    try:
        len(box[0])
    except TypeError:
        # The first element is a scalar: flat [x1, y1, x2, y2] form.
        return box[0], box[1], box[2], box[3]
    # The first element is a point: take the polygon's bounding rectangle.
    xs = [p[0] for p in box]
    ys = [p[1] for p in box]
    return min(xs), min(ys), max(xs), max(ys)


def iou_bbox(a: List, b: List) -> float:
    """Compute the intersection-over-union of two bounding boxes.

    Each box is either a polygon ``[[x1, y1], [x2, y2], ...]`` or a flat
    rectangle ``[x1, y1, x2, y2]``. Polygons are reduced to their
    axis-aligned bounding rectangle first. Polygon points may be any
    indexable pair (list, tuple, array row), not only ``list``.

    Args:
        a: First box.
        b: Second box.

    Returns:
        IoU in ``[0.0, 1.0]``. Returns ``0.0`` when the rectangles do not
        overlap, merely touch, or both have zero area.
    """
    ax1, ay1, ax2, ay2 = _bbox_bounds(a)
    bx1, by1, bx2, by2 = _bbox_bounds(b)

    ix1 = max(ax1, bx1)
    iy1 = max(ay1, by1)
    ix2 = min(ax2, bx2)
    iy2 = min(ay2, by2)
    if ix2 <= ix1 or iy2 <= iy1:
        return 0.0

    inter = (ix2 - ix1) * (iy2 - iy1)
    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0
def _iter_ocr_lines(res):
    """Yield ``(box, text, score)`` triples from one OCR engine result.

    Two result shapes are handled:
      * a subscriptable result whose first element is a list of
        ``[box, text, score]`` items (the shape this script was written for);
      * an object exposing parallel ``boxes`` / ``txts`` / ``scores``
        attributes.
        NOTE(review): believed to match newer ``rapidocr`` releases;
        confirm against the installed version.
    """
    if res is None:
        return
    if hasattr(res, "txts"):
        boxes = getattr(res, "boxes", None)
        txts = res.txts
        scores = getattr(res, "scores", None)
        if boxes is None or txts is None or scores is None:
            return  # nothing detected on this tile
        yield from zip(boxes, txts, scores)
        return
    if res and res[0]:
        for box, txt, score in res[0]:
            yield box, txt, score


def run_tiling_ocr(png_path: Path, tile_size: int = 2000, overlap: int = 200, conf_threshold: float = 0.5):
    """Run OCR over an image tile by tile and merge the results.

    The image is split with ``make_tiles``. Each crop is written to a
    temporary PNG and passed to the OCR engine. Detections scoring below
    ``conf_threshold`` are dropped, and the remaining boxes are shifted
    into full-image coordinates. Detections whose boxes overlap with
    IoU > 0.5 are then de-duplicated, keeping the more confident one.

    Args:
        png_path: Path to the source image.
        tile_size: Tile size in pixels, forwarded to ``make_tiles``.
        overlap: Tile overlap in pixels, forwarded to ``make_tiles``.
        conf_threshold: Minimum score a detection needs to be kept.

    Returns:
        A list of dicts with keys ``"text"``, ``"confidence"`` (float) and
        ``"bbox"`` (list of ``[x, y]`` float points in full-image
        coordinates), ordered by descending confidence.
    """
    # Local import keeps this block self-contained.
    import tempfile

    print(f"[INFO] Загрузка {png_path.name}...")
    all_results = []

    # The context managers close the image file and remove every temporary
    # crop, even if OCR fails part-way. A private temp directory also avoids
    # name clashes between concurrent runs and works where /tmp is absent.
    with Image.open(png_path) as img, tempfile.TemporaryDirectory(prefix="tiling_ocr_") as tmp_dir:
        print(f"[INFO] Размер: {img.size}")
        tiles = make_tiles(img, tile_size, overlap)
        print(f"[INFO] Кропов: {len(tiles)}")
        engine = RapidOCR()

        for i, (off_x, off_y, crop) in enumerate(tiles, 1):
            # Save the crop temporarily; the engine is given a file path.
            tmp_path = str(Path(tmp_dir) / f"tile_{i:03d}.png")
            crop.save(tmp_path)
            print(f" [{i}/{len(tiles)}] tile @ ({off_x}, {off_y}) size {crop.size} ...", end=" ", flush=True)
            res = engine(tmp_path)

            tile_lines = 0
            for box, txt, score in _iter_ocr_lines(res):
                if score < conf_threshold:
                    continue
                # Shift the box by the tile offset. float() strips any
                # array scalar types so the result is JSON-serialisable.
                shifted_box = [
                    [float(pt[0]) + off_x, float(pt[1]) + off_y] for pt in box
                ]
                all_results.append({
                    "text": txt,
                    "confidence": float(score),
                    "bbox": shifted_box
                })
                tile_lines += 1
            print(f"{tile_lines} lines")

    # De-duplication: when two boxes overlap (IoU > 0.5), keep the one with
    # the higher confidence. Visiting candidates in descending confidence
    # means the first box kept in any overlapping group is the best one.
    print(f"[INFO] Дедупликация {len(all_results)} строк...")
    unique = []
    for cand in sorted(all_results, key=lambda r: r["confidence"], reverse=True):
        if not any(iou_bbox(cand["bbox"], kept["bbox"]) > 0.5 for kept in unique):
            unique.append(cand)
    print(f"[OK] Уникальных строк: {len(unique)}")
    return unique
def main():
    """Command-line entry point.

    Usage: ``python tiling_ocr.py <png> [tile_size] [overlap]``

    Runs tiled OCR on the given image and writes the detections to
    ``<stem>_tiling_ocr.json`` next to it. It then prints every detection
    whose text is a plain number (integer or decimal with ``,`` or ``.``),
    ordered by the y coordinate of the box's first point.

    Exits with a non-zero status when the arguments are missing, are not
    integers, describe an invalid tile/overlap combination, or the image
    file does not exist.
    """
    if len(sys.argv) < 2:
        print("Usage: python tiling_ocr.py <png> [tile_size] [overlap]")
        sys.exit(1)

    png_path = Path(sys.argv[1])
    try:
        tile_size = int(sys.argv[2]) if len(sys.argv) > 2 else 2000
        overlap = int(sys.argv[3]) if len(sys.argv) > 3 else 200
    except ValueError:
        sys.exit("[ERROR] tile_size and overlap must be integers")

    # A non-positive stride (tile_size - overlap) cannot cover the image.
    if tile_size <= 0 or not 0 <= overlap < tile_size:
        sys.exit("[ERROR] expected tile_size > 0 and 0 <= overlap < tile_size")
    if not png_path.is_file():
        sys.exit(f"[ERROR] file not found: {png_path}")

    results = run_tiling_ocr(png_path, tile_size, overlap)

    # Save the results next to the source image.
    out_json = png_path.parent / f"{png_path.stem}_tiling_ocr.json"
    with open(out_json, "w", encoding="utf-8") as f:
        json.dump({
            "source": str(png_path),
            "tile_size": tile_size,
            "overlap": overlap,
            "total_lines": len(results),
            "lines": results
        }, f, ensure_ascii=False, indent=2)
    print(f"[OK] Сохранено: {out_json}")

    # Print the detections that are plain numbers.
    number_re = re.compile(r'^\d+([,.]\d+)?$')
    nums = [r for r in results if number_re.match(r["text"].strip())]
    print(f"\nНайдено {len(nums)} чисел:")
    for n in sorted(nums, key=lambda x: x["bbox"][0][1]):
        bbox = n["bbox"]
        # Report the box centre as the mean of its corner points.
        cx = sum(p[0] for p in bbox) / len(bbox)
        cy = sum(p[1] for p in bbox) / len(bbox)
        print(f" {n['text']:>10} x={cx:>8.0f} y={cy:>8.0f} conf={n['confidence']:.2f}")


if __name__ == "__main__":
    main()