- tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR
197 lines
6.5 KiB
Python
197 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Универсальное распознавание PDF в указанную папку.
|
|
Поддерживает:
|
|
- RapidOCR (локально, быстро)
|
|
- RapidOCR + tiling (для больших чертежей)
|
|
- qwen-vl-ocr (API, точнее)
|
|
|
|
Использование:
|
|
python process_any_pdf.py <pdf_file> <output_folder> [--use-qwen] [--use-tiling]
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import re
|
|
import fitz
|
|
from pathlib import Path
|
|
from PIL import Image
|
|
from rapidocr_onnxruntime import RapidOCR
|
|
|
|
# ------------------------------------------------------------------
# Parameters
# ------------------------------------------------------------------
DPI = 300  # page render resolution; pages are scaled by DPI / 72
BATCH_SIZE = 5  # the results JSON is rewritten every BATCH_SIZE pages
TILE_SIZE = 2000  # tile edge length in pixels used by the tiling OCR
TILE_OVERLAP = 200  # pixels shared by neighbouring tiles

# Single RapidOCR engine instance, created at import time and shared by all calls.
engine = RapidOCR()

# Optional qwen-vl-ocr backend: QWEN_AVAILABLE records whether ocr_qwen
# could be imported, so callers can fall back to RapidOCR when it is missing.
try:
    from ocr_qwen import run_ocr as qwen_ocr
    QWEN_AVAILABLE = True
except ImportError:
    QWEN_AVAILABLE = False
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
# Tiling OCR helpers
|
|
# ------------------------------------------------------------------
|
|
def _make_tiles(img: Image.Image, tile_size: int = 2000, overlap: int = 200):
|
|
w, h = img.size
|
|
tiles = []
|
|
step = tile_size - overlap
|
|
for y in range(0, h, step):
|
|
for x in range(0, w, step):
|
|
x2 = min(x + tile_size, w)
|
|
y2 = min(y + tile_size, h)
|
|
tiles.append((x, y, img.crop((x, y, x2, y2))))
|
|
return tiles
|
|
|
|
|
|
def _bbox_iou(a, b):
|
|
def _rect(box):
|
|
if isinstance(box[0], list):
|
|
xs = [p[0] for p in box]
|
|
ys = [p[1] for p in box]
|
|
return min(xs), min(ys), max(xs), max(ys)
|
|
return box[0], box[1], box[2], box[3]
|
|
|
|
ax1, ay1, ax2, ay2 = _rect(a)
|
|
bx1, by1, bx2, by2 = _rect(b)
|
|
ix1, iy1 = max(ax1, bx1), max(ay1, by1)
|
|
ix2, iy2 = min(ax2, bx2), min(ay2, by2)
|
|
if ix2 <= ix1 or iy2 <= iy1:
|
|
return 0.0
|
|
inter = (ix2 - ix1) * (iy2 - iy1)
|
|
union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
|
|
return inter / union if union > 0 else 0.0
|
|
|
|
|
|
def run_tiling_ocr(img_path: Path, conf_threshold: float = 0.5):
    """Run RapidOCR tile by tile and merge the results.

    The image is split with ``_make_tiles`` using ``TILE_SIZE`` and
    ``TILE_OVERLAP``; each tile is recognized separately and its boxes are
    shifted back into full-image coordinates. Detections that overlap an
    already accepted, more confident detection with IoU above 0.5 are
    dropped as duplicates coming from the tile overlap.

    Args:
        img_path: Path of the page image to recognize.
        conf_threshold: Detections with a score below this value are ignored.

    Returns:
        A list of dicts with keys ``"text"``, ``"confidence"`` and ``"bbox"``
        (list of ``[x, y]`` points), ordered by descending confidence.
    """
    # Local import keeps this block self-contained; tempfile is stdlib.
    import tempfile

    all_results = []
    # A private temporary directory avoids the fixed, shared /tmp file name
    # (not portable, clashes between concurrent runs) and is removed on exit.
    with Image.open(img_path) as img, \
            tempfile.TemporaryDirectory(prefix="tile_ocr_") as tmp_dir:
        tile_path = str(Path(tmp_dir) / "tile.png")
        for off_x, off_y, crop in _make_tiles(img, TILE_SIZE, TILE_OVERLAP):
            crop.save(tile_path)
            res = engine(tile_path)
            if not (res and res[0]):
                continue
            for box, txt, score in res[0]:
                if score < conf_threshold:
                    continue
                # Translate tile-local coordinates to full-image coordinates.
                shifted = [[pt[0] + off_x, pt[1] + off_y] for pt in box]
                all_results.append(
                    {"text": txt, "confidence": float(score), "bbox": shifted}
                )

    # IoU-based deduplication: the most confident detection wins.
    unique = []
    for r in sorted(all_results, key=lambda x: -x["confidence"]):
        if not any(_bbox_iou(r["bbox"], u["bbox"]) > 0.5 for u in unique):
            unique.append(r)
    return unique
|
|
|
|
# ------------------------------------------------------------------
|
|
def process_pdf(pdf_path: Path, out_dir: Path, use_qwen: bool = False, use_tiling: bool = False):
    """Render every page of a PDF to PNG, run OCR on it and save the results.

    For each page the embedded text layer is extracted, the page is rendered
    at ``DPI`` into ``out_dir/page_NNN.png`` and recognized with the selected
    engine. Accumulated results are written to
    ``out_dir/full_ocr_results.json`` every ``BATCH_SIZE`` pages and after
    the last page.

    Args:
        pdf_path: PDF file to process.
        out_dir: Output folder; created if it does not exist.
        use_qwen: Use the qwen-vl-ocr backend. If the backend could not be
            imported a warning is printed and a local engine is used; if a
            call fails for a page, that page falls back to plain RapidOCR.
        use_tiling: Use RapidOCR over overlapping tiles (ignored while the
            qwen backend is in use).
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    results_path = out_dir / "full_ocr_results.json"
    # The render matrix does not depend on the page, so build it once.
    mat = fitz.Matrix(DPI / 72, DPI / 72)

    doc = fitz.open(pdf_path)
    try:
        total = len(doc)
        print(f"=== PDF: {pdf_path.name} | Страниц: {total} -> {out_dir} ===")

        if use_qwen and not QWEN_AVAILABLE:
            # Make the fallback visible instead of announcing an engine
            # that is never actually used.
            print("[WARN] qwen-vl-ocr requested but ocr_qwen is not importable; "
                  "falling back to local OCR")
            use_qwen = False

        if use_qwen:
            print("[INFO] OCR engine: qwen-vl-ocr (API)")
        elif use_tiling:
            print(f"[INFO] OCR engine: RapidOCR + tiling ({TILE_SIZE}px tiles)")
        else:
            print("[INFO] OCR engine: RapidOCR (local)")
        print()

        all_pages = []
        for i in range(total):
            print(f"[{i+1}/{total}] Рендер + OCR ...", end=" ", flush=True)
            page = doc.load_page(i)
            raw_text = page.get_text("text").strip()

            pix = page.get_pixmap(matrix=mat)
            img_path = out_dir / f"page_{i+1:03d}.png"
            pix.save(img_path)

            # Select the OCR engine for this page.
            if use_qwen:
                try:
                    ocr_lines = qwen_ocr(img_path, verbose=False)
                    print(f"qwen-ocr строк: {len(ocr_lines)}")
                except Exception as e:
                    # Best effort: one failed API call must not abort the run.
                    print(f"qwen-ocr ERR: {e}, fallback to RapidOCR")
                    ocr_lines = _run_rapidocr(img_path)
                    print(f"RapidOCR строк: {len(ocr_lines)}")
            elif use_tiling:
                ocr_lines = run_tiling_ocr(img_path)
                print(f"Tiling OCR строк: {len(ocr_lines)}")
            else:
                ocr_lines = _run_rapidocr(img_path)
                print(f"RapidOCR строк: {len(ocr_lines)}")

            all_pages.append({
                "page_number": i + 1,
                "image": str(img_path.name),
                "pdf_text_layer": raw_text,
                "ocr_lines": ocr_lines,
                "ocr_line_count": len(ocr_lines),
            })

            # Checkpoint so a long run keeps partial results if interrupted.
            if (i + 1) % BATCH_SIZE == 0 or i == total - 1:
                with open(results_path, "w", encoding="utf-8") as f:
                    json.dump({"pages": all_pages}, f, ensure_ascii=False, indent=2)
                print(f" -> сохранено ({i+1} страниц)")
    finally:
        # Release the document even when rendering or OCR raises.
        doc.close()

    print(f"\n=== Готово. Результат в {out_dir} ===")
|
|
|
|
|
|
def _run_rapidocr(img_path: Path):
    """Recognize a whole image with the shared RapidOCR engine.

    Args:
        img_path: Path of the image to recognize.

    Returns:
        A list of dicts with keys ``"text"``, ``"confidence"`` and ``"bbox"``,
        one per detection; empty when the engine reports nothing.
    """
    result = engine(img_path)
    # Nothing recognized: the engine result is empty or carries no detections.
    if not result or result[0] is None:
        return []
    return [
        {"text": text, "confidence": float(conf), "bbox": quad}
        for quad, text, conf in result[0]
    ]
|
|
|
|
# ------------------------------------------------------------------
|
|
def main():
    """Command-line entry point.

    Usage: ``process_any_pdf.py <pdf_file> <output_folder> [--use-qwen] [--use-tiling]``

    Flags may appear anywhere and any number of times. Without positional
    arguments the defaults ``123.pdf`` / ``output_123`` are used; without an
    output folder it defaults to ``output_<pdf stem>``. Exits with status 1
    when the PDF file does not exist.
    """
    flags = {"--use-qwen", "--use-tiling"}
    # Work on a copy: sys.argv is left untouched, and repeated flags are
    # filtered out instead of leaking into the positional arguments.
    args = sys.argv[1:]
    use_qwen = "--use-qwen" in args
    use_tiling = "--use-tiling" in args
    positional = [arg for arg in args if arg not in flags]

    if positional:
        pdf_file = positional[0]
        out_name = positional[1] if len(positional) > 1 else f"output_{Path(pdf_file).stem}"
    else:
        pdf_file = "123.pdf"
        out_name = "output_123"

    pdf_path = Path(pdf_file)
    out_dir = Path(out_name)

    if not pdf_path.exists():
        print(f"[ERR] Файл не найден: {pdf_path}")
        sys.exit(1)

    process_pdf(pdf_path, out_dir, use_qwen=use_qwen, use_tiling=use_tiling)


if __name__ == "__main__":
    main()
|