opencode/process_any_pdf.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Универсальное распознавание PDF в указанную папку.
Поддерживает:
  - RapidOCR (локально, быстро)
  - RapidOCR + tiling (для больших чертежей)
  - qwen-vl-ocr (API, точнее)

Использование:
    python process_any_pdf.py <pdf_file> <output_folder> [--use-qwen] [--use-tiling]
"""

import sys
import json
import re
import fitz
from pathlib import Path
from PIL import Image
from rapidocr_onnxruntime import RapidOCR

# ------------------------------------------------------------------
# Parameters
# ------------------------------------------------------------------
DPI = 300  # page render resolution; pages are scaled by DPI / 72 when rasterized
BATCH_SIZE = 5  # the results JSON is rewritten after every BATCH_SIZE pages
TILE_SIZE = 2000  # tile edge length in pixels used by the tiling OCR mode
TILE_OVERLAP = 200  # pixels shared between neighbouring tiles

# Shared RapidOCR instance: created once at import time and reused for every
# page and every tile.
engine = RapidOCR()

# Optional qwen-vl-ocr backend. The import is attempted at module load; when
# ocr_qwen is missing the script keeps working and QWEN_AVAILABLE records that
# the backend cannot be used.
try:
    from ocr_qwen import run_ocr as qwen_ocr
    QWEN_AVAILABLE = True
except ImportError:
    QWEN_AVAILABLE = False


# ------------------------------------------------------------------
# Tiling OCR helpers
# ------------------------------------------------------------------
def _make_tiles(img: "Image.Image", tile_size: int = 2000, overlap: int = 200):
    """Split an image into overlapping tiles, listed row by row.

    Args:
        img: Source image; only ``img.size`` and ``img.crop`` are used.
        tile_size: Maximum tile edge length in pixels.
        overlap: Number of pixels neighbouring tiles share; must be smaller
            than ``tile_size``.

    Returns:
        A list of ``(x, y, crop)`` tuples where ``(x, y)`` is the tile's
        top-left corner in ``img`` coordinates. Tiles touching the right or
        bottom edge are clipped to the image bounds.

    Raises:
        ValueError: If ``tile_size`` is not positive or ``overlap`` is not
            smaller than ``tile_size`` (the tiling could not advance).
    """
    step = tile_size - overlap
    if tile_size <= 0 or step <= 0:
        raise ValueError(
            f"tile_size must be positive and larger than overlap "
            f"(got tile_size={tile_size}, overlap={overlap})"
        )

    w, h = img.size

    def _starts(length):
        # A start p > 0 is redundant when the previous tile (at p - step)
        # already reaches the edge, i.e. when p + overlap >= length; such a
        # tile would only be a sliver of pixels that were already covered.
        return [p for p in range(0, length, step) if p == 0 or p + overlap < length]

    xs = _starts(w)
    return [
        (x, y, img.crop((x, y, min(x + tile_size, w), min(y + tile_size, h))))
        for y in _starts(h)
        for x in xs
    ]


def _bbox_iou(a, b):
    """Return the intersection-over-union of two boxes.

    Each box is either a polygon given as a sequence of ``(x, y)`` points or
    a flat ``(x1, y1, x2, y2)`` rectangle. Polygons are reduced to their
    axis-aligned bounding rectangle before the comparison.

    Returns:
        The IoU as a float; ``0.0`` when the rectangles do not overlap or
        the union has no area.
    """
    def _rect(box):
        # Points may arrive as lists, tuples or array rows, so anything with
        # a length counts as a point; a bare number means the flat form.
        if hasattr(box[0], "__len__"):
            xs = [p[0] for p in box]
            ys = [p[1] for p in box]
            return min(xs), min(ys), max(xs), max(ys)
        return box[0], box[1], box[2], box[3]

    ax1, ay1, ax2, ay2 = _rect(a)
    bx1, by1, bx2, by2 = _rect(b)

    ix1, iy1 = max(ax1, bx1), max(ay1, by1)
    ix2, iy2 = min(ax2, bx2), min(ay2, by2)
    if ix2 <= ix1 or iy2 <= iy1:
        # Touching edges or disjoint rectangles share no area.
        return 0.0

    inter = (ix2 - ix1) * (iy2 - iy1)
    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    return inter / union if union > 0 else 0.0


def run_tiling_ocr(img_path: Path, conf_threshold: float = 0.5):
    """Run RapidOCR on overlapping tiles of an image and merge the results.

    The image is split with ``_make_tiles`` using ``TILE_SIZE`` and
    ``TILE_OVERLAP``. Every tile is written to a private temporary file and
    passed to the shared ``engine``. Detections are shifted back into
    full-image coordinates and de-duplicated.

    Args:
        img_path: Path of the image to recognise.
        conf_threshold: Detections scoring below this value are dropped.

    Returns:
        A list of dicts with the keys ``"text"``, ``"confidence"`` (float)
        and ``"bbox"`` (list of ``[x, y]`` points in full-image coordinates),
        ordered by descending confidence.
    """
    # Function-scope import keeps this block self-contained; the module
    # header does not import tempfile.
    import tempfile

    all_results = []
    # A per-call temporary directory works on every platform, cannot collide
    # with a concurrent run and is removed afterwards. The image stays open
    # until all tiles have been processed.
    with tempfile.TemporaryDirectory(prefix="tile_ocr_") as tmp_dir, \
            Image.open(img_path) as img:
        tmp = str(Path(tmp_dir) / "tile_ocr.png")
        for off_x, off_y, crop in _make_tiles(img, TILE_SIZE, TILE_OVERLAP):
            crop.save(tmp)
            res = engine(tmp)
            if not (res and res[0]):
                continue
            for box, txt, score in res[0]:
                if score < conf_threshold:
                    continue
                # Tile-local coordinates -> full-image coordinates.
                shifted = [[pt[0] + off_x, pt[1] + off_y] for pt in box]
                all_results.append({"text": txt, "confidence": float(score), "bbox": shifted})

    # De-duplicate by IoU: text inside an overlap zone is detected in several
    # tiles. Walking from the most confident detection down keeps the best
    # copy of each.
    unique = []
    for r in sorted(all_results, key=lambda x: -x["confidence"]):
        is_dup = any(_bbox_iou(r["bbox"], u["bbox"]) > 0.5 for u in unique)
        if not is_dup:
            unique.append(r)
    return unique

# ------------------------------------------------------------------
def process_pdf(pdf_path: Path, out_dir: Path, use_qwen: bool = False, use_tiling: bool = False):
    """Render every page of a PDF to PNG, OCR it and save the results as JSON.

    For each page the embedded text layer is extracted, the page is rendered
    at ``DPI`` into ``out_dir / page_NNN.png`` and the image is recognised
    with the selected engine. Results are collected into
    ``out_dir / full_ocr_results.json``, which is rewritten after every
    ``BATCH_SIZE`` pages and after the last page, so an interrupted run keeps
    the pages finished so far.

    Args:
        pdf_path: PDF file to process.
        out_dir: Output folder; created if missing.
        use_qwen: Prefer the qwen-vl-ocr backend. If the backend could not be
            imported, a warning is printed and the tiling/RapidOCR choice
            applies instead. If a qwen call fails for a page, that page falls
            back to plain RapidOCR.
        use_tiling: Use tiled RapidOCR (only when qwen is not in effect).
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    results_path = out_dir / "full_ocr_results.json"
    # qwen can only be used when its optional import succeeded.
    qwen_active = use_qwen and QWEN_AVAILABLE
    # The render scale is the same for every page, so build the matrix once.
    mat = fitz.Matrix(DPI / 72, DPI / 72)

    doc = fitz.open(pdf_path)
    try:
        total = len(doc)
        print(f"=== PDF: {pdf_path.name} | Страниц: {total} -> {out_dir} ===")
        if use_qwen and not QWEN_AVAILABLE:
            # Report the real engine instead of claiming qwen is in use.
            print("[WARN] qwen-vl-ocr requested but ocr_qwen could not be imported; using RapidOCR instead")
        if qwen_active:
            print("[INFO] OCR engine: qwen-vl-ocr (API)")
        elif use_tiling:
            print(f"[INFO] OCR engine: RapidOCR + tiling ({TILE_SIZE}px tiles)")
        else:
            print("[INFO] OCR engine: RapidOCR (local)")
        print()

        all_pages = []
        for page_no in range(1, total + 1):
            print(f"[{page_no}/{total}] Рендер + OCR ...", end=" ", flush=True)
            page = doc.load_page(page_no - 1)
            raw_text = page.get_text("text").strip()

            pix = page.get_pixmap(matrix=mat)
            img_path = out_dir / f"page_{page_no:03d}.png"
            pix.save(img_path)

            # Pick the OCR engine for this page.
            if qwen_active:
                try:
                    ocr_lines = qwen_ocr(img_path, verbose=False)
                    print(f"qwen-ocr строк: {len(ocr_lines)}")
                except Exception as e:
                    # Best effort: a failing API call must not abort the run.
                    print(f"qwen-ocr ERR: {e}, fallback to RapidOCR")
                    ocr_lines = _run_rapidocr(img_path)
                    print(f"RapidOCR строк: {len(ocr_lines)}")
            elif use_tiling:
                ocr_lines = run_tiling_ocr(img_path)
                print(f"Tiling OCR строк: {len(ocr_lines)}")
            else:
                ocr_lines = _run_rapidocr(img_path)
                print(f"RapidOCR строк: {len(ocr_lines)}")

            all_pages.append({
                "page_number": page_no,
                "image": str(img_path.name),
                "pdf_text_layer": raw_text,
                "ocr_lines": ocr_lines,
                "ocr_line_count": len(ocr_lines)
            })

            # Checkpoint after every full batch and after the final page.
            if page_no % BATCH_SIZE == 0 or page_no == total:
                with open(results_path, "w", encoding="utf-8") as f:
                    json.dump({"pages": all_pages}, f, ensure_ascii=False, indent=2)
                print(f"  -> сохранено ({page_no} страниц)")
    finally:
        # Release the document even when rendering or OCR raises.
        doc.close()

    print(f"\n=== Готово. Результат в {out_dir} ===")


def _run_rapidocr(img_path: Path):
    """Run the shared RapidOCR engine on a whole image.

    Returns a list with one dict per detection, holding the keys ``"text"``,
    ``"confidence"`` (converted to float) and ``"bbox"`` (as returned by the
    engine). The list is empty when the engine yields no result.
    """
    res = engine(img_path)
    # Nothing usable came back: empty result or no detection list.
    if not res or res[0] is None:
        return []
    return [
        {"text": txt, "confidence": float(score), "bbox": box}
        for box, txt, score in res[0]
    ]

# ------------------------------------------------------------------
def main():
    """Command-line entry point.

    Recognised arguments: an optional PDF path, an optional output folder
    and the flags ``--use-qwen`` / ``--use-tiling``, which may appear
    anywhere. Without a PDF path the script processes ``123.pdf`` into
    ``output_123``; without an output folder it uses
    ``output_<pdf stem>``.

    Exits with status 1 when the PDF does not exist.
    """
    flags = ("--use-qwen", "--use-tiling")
    # Work on a copy so the process-wide sys.argv is left untouched.
    args = sys.argv[1:]
    use_qwen = "--use-qwen" in args
    use_tiling = "--use-tiling" in args
    positional = [arg for arg in args if arg not in flags]

    if positional:
        pdf_file = positional[0]
        if len(positional) > 1:
            out_name = positional[1]
        else:
            out_name = f"output_{Path(pdf_file).stem}"
    else:
        pdf_file = "123.pdf"
        out_name = "output_123"

    pdf_path = Path(pdf_file)
    if not pdf_path.exists():
        # Errors go to stderr so they do not mix with progress output.
        print(f"[ERR] Файл не найден: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    process_pdf(pdf_path, Path(out_name), use_qwen=use_qwen, use_tiling=use_tiling)

if __name__ == "__main__":
    main()
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`
			`"""`
			`Универсальное распознавание PDF в указанную папку.`
Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00			`Поддерживает:`
			`- RapidOCR (локально, быстро)`
			`- RapidOCR + tiling (для больших чертежей)`
			`- qwen-vl-ocr (API, точнее)`

Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00			`Использование:`
Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00			`python process_any_pdf.py <pdf_file> <output_folder> [--use-qwen] [--use-tiling]`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00			`"""`

			`import sys`
			`import json`
Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00			`import re`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00			`import fitz`
			`from pathlib import Path`
Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00			`from PIL import Image`
Add RAG pipeline: LightRAG indexer, OpenCode API, VLM describer, and test tools - Add rag_indexer.py: build LightRAG index from OCR with OpenCode API - Add rag_query.py: query the knowledge graph - Add vlm_describer.py: generate VLM descriptions via LM Studio - Add test_model.py: quick check for LightRAG-compatible models - Add run_pipeline.sh and run_pipeline.bat: full OCR → VLM → RAG pipeline - Fix rapidocr import (rapidocr_onnxruntime) - Fix process_any_pdf.py paths for cross-platform use - Add .env.example, README_RAG.md, AGENTS.md - Update .gitignore for outputs and secrets 2026-05-29 06:54:37 +00:00			`from rapidocr_onnxruntime import RapidOCR`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00
			`# ------------------------------------------------------------------`
			`# Параметры`
			`# ------------------------------------------------------------------`
			`DPI = 300`
			`BATCH_SIZE = 5`
Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00			`TILE_SIZE = 2000`
			`TILE_OVERLAP = 200`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00
			`engine = RapidOCR()`

Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00			`# qwen-vl-ocr lazy import`
			`try:`
			`from ocr_qwen import run_ocr as qwen_ocr`
			`QWEN_AVAILABLE = True`
			`except ImportError:`
			`QWEN_AVAILABLE = False`


			`# ------------------------------------------------------------------`
			`# Tiling OCR helpers`
			`# ------------------------------------------------------------------`
			`def _make_tiles(img: Image.Image, tile_size: int = 2000, overlap: int = 200):`
			`w, h = img.size`
			`tiles = []`
			`step = tile_size - overlap`
			`for y in range(0, h, step):`
			`for x in range(0, w, step):`
			`x2 = min(x + tile_size, w)`
			`y2 = min(y + tile_size, h)`
			`tiles.append((x, y, img.crop((x, y, x2, y2))))`
			`return tiles`


			`def _bbox_iou(a, b):`
			`def _rect(box):`
			`if isinstance(box[0], list):`
			`xs = [p[0] for p in box]`
			`ys = [p[1] for p in box]`
			`return min(xs), min(ys), max(xs), max(ys)`
			`return box[0], box[1], box[2], box[3]`

			`ax1, ay1, ax2, ay2 = _rect(a)`
			`bx1, by1, bx2, by2 = _rect(b)`
			`ix1, iy1 = max(ax1, bx1), max(ay1, by1)`
			`ix2, iy2 = min(ax2, bx2), min(ay2, by2)`
			`if ix2 <= ix1 or iy2 <= iy1:`
			`return 0.0`
			`inter = (ix2 - ix1) * (iy2 - iy1)`
			`union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter`
			`return inter / union if union > 0 else 0.0`


			`def run_tiling_ocr(img_path: Path, conf_threshold: float = 0.5):`
			`"""Запускает RapidOCR по кропам и объединяет результаты."""`
			`img = Image.open(img_path)`
			`tiles = _make_tiles(img, TILE_SIZE, TILE_OVERLAP)`
			`all_results = []`
			`for off_x, off_y, crop in tiles:`
			`tmp = f"/tmp/tile_ocr.png"`
			`crop.save(tmp)`
			`res = engine(tmp)`
			`if res and res[0]:`
			`for item in res[0]:`
			`box, txt, score = item`
			`if score < conf_threshold:`
			`continue`
			`shifted = [[pt[0] + off_x, pt[1] + off_y] for pt in box]`
			`all_results.append({"text": txt, "confidence": float(score), "bbox": shifted})`

			`# Дедупликация по IoU`
			`unique = []`
			`for r in sorted(all_results, key=lambda x: -x["confidence"]):`
			`is_dup = any(_bbox_iou(r["bbox"], u["bbox"]) > 0.5 for u in unique)`
			`if not is_dup:`
			`unique.append(r)`
			`return unique`

Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00			`# ------------------------------------------------------------------`
Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00			`def process_pdf(pdf_path: Path, out_dir: Path, use_qwen: bool = False, use_tiling: bool = False):`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00			`out_dir.mkdir(parents=True, exist_ok=True)`
			`doc = fitz.open(pdf_path)`
			`total = len(doc)`
Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00			`print(f"=== PDF: {pdf_path.name} \| Страниц: {total} -> {out_dir} ===")`
			`if use_qwen:`
			`print(f"[INFO] OCR engine: qwen-vl-ocr (API)")`
			`elif use_tiling:`
			`print(f"[INFO] OCR engine: RapidOCR + tiling ({TILE_SIZE}px tiles)")`
			`else:`
			`print(f"[INFO] OCR engine: RapidOCR (local)")`
			`print()`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00
			`all_pages = []`
			`for i in range(total):`
Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00			`print(f"[{i+1}/{total}] Рендер + OCR ...", end=" ", flush=True)`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00			`page = doc.load_page(i)`
			`raw_text = page.get_text("text").strip()`

			`mat = fitz.Matrix(DPI / 72, DPI / 72)`
			`pix = page.get_pixmap(matrix=mat)`
			`img_path = out_dir / f"page_{i+1:03d}.png"`
			`pix.save(img_path)`

Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00			`# Выбор OCR engine`
			`if use_qwen and QWEN_AVAILABLE:`
			`try:`
			`ocr_lines = qwen_ocr(img_path, verbose=False)`
			`print(f"qwen-ocr строк: {len(ocr_lines)}")`
			`except Exception as e:`
			`print(f"qwen-ocr ERR: {e}, fallback to RapidOCR")`
			`ocr_lines = _run_rapidocr(img_path)`
			`print(f"RapidOCR строк: {len(ocr_lines)}")`
			`elif use_tiling:`
			`ocr_lines = run_tiling_ocr(img_path)`
			`print(f"Tiling OCR строк: {len(ocr_lines)}")`
			`else:`
			`ocr_lines = _run_rapidocr(img_path)`
			`print(f"RapidOCR строк: {len(ocr_lines)}")`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00
			`all_pages.append({`
			`"page_number": i + 1,`
			`"image": str(img_path.name),`
			`"pdf_text_layer": raw_text,`
			`"ocr_lines": ocr_lines,`
			`"ocr_line_count": len(ocr_lines)`
			`})`

			`if (i + 1) % BATCH_SIZE == 0 or i == total - 1:`
			`with open(out_dir / "full_ocr_results.json", "w", encoding="utf-8") as f:`
			`json.dump({"pages": all_pages}, f, ensure_ascii=False, indent=2)`
Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00			`print(f" -> сохранено ({i+1} страниц)")`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00
			`doc.close()`
			`print(f"\n=== Готово. Результат в {out_dir} ===")`

Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00
			`def _run_rapidocr(img_path: Path):`
			`res = engine(img_path)`
			`ocr_lines = []`
			`if res and res[0] is not None:`
			`for item in res[0]:`
			`box, txt, score = item`
			`ocr_lines.append({`
			`"text": txt,`
			`"confidence": float(score),`
			`"bbox": box`
			`})`
			`return ocr_lines`

Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00			`# ------------------------------------------------------------------`
			`def main():`
Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00			`use_qwen = "--use-qwen" in sys.argv`
			`use_tiling = "--use-tiling" in sys.argv`
			`if use_qwen:`
			`sys.argv.remove("--use-qwen")`
			`if use_tiling:`
			`sys.argv.remove("--use-tiling")`

Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00			`if len(sys.argv) < 2:`
			`pdf_file = "123.pdf"`
			`out_name = "output_123"`
			`else:`
			`pdf_file = sys.argv[1]`
			`out_name = sys.argv[2] if len(sys.argv) > 2 else f"output_{Path(pdf_file).stem}"`

Add RAG pipeline: LightRAG indexer, OpenCode API, VLM describer, and test tools - Add rag_indexer.py: build LightRAG index from OCR with OpenCode API - Add rag_query.py: query the knowledge graph - Add vlm_describer.py: generate VLM descriptions via LM Studio - Add test_model.py: quick check for LightRAG-compatible models - Add run_pipeline.sh and run_pipeline.bat: full OCR → VLM → RAG pipeline - Fix rapidocr import (rapidocr_onnxruntime) - Fix process_any_pdf.py paths for cross-platform use - Add .env.example, README_RAG.md, AGENTS.md - Update .gitignore for outputs and secrets 2026-05-29 06:54:37 +00:00			`pdf_path = Path(pdf_file) if Path(pdf_file).exists() else Path(".") / pdf_file`
			`out_dir = Path(out_name)`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00
			`if not pdf_path.exists():`
			`print(f"[ERR] Файл не найден: {pdf_path}")`
			`sys.exit(1)`

Add tiling OCR, preprocess and visualization tools - tiling_ocr.py: split large drawings into overlapping tiles for better small-text recognition - preprocess_for_ocr.py: CLAHE + unsharp mask for enhancing blueprint contrast - visualize_dimensions.py: draw bounding boxes around detected dimension numbers - compare_ocr.py: side-by-side visualization of normal vs tiling OCR results - dimension_extractor.py: line-based dimension detection with pixel verification - ocr_qwen.py: Alibaba Cloud qwen-vl-ocr client with resize and regex fallback parser - test_qwen_ocr.py: standalone test for qwen OCR - process_any_pdf.py: add --use-tiling flag to switch between normal and tiling OCR 2026-06-01 09:29:26 +00:00			`process_pdf(pdf_path, out_dir, use_qwen=use_qwen, use_tiling=use_tiling)`
Add PDF OCR pipeline and project indexes for Кронштадтский and 123 2026-05-28 22:04:01 +00:00
			`if __name__ == "__main__":`
			`main()`