opencode/multi_element_extractor.py
Кирилл Блинов feeb02242b Add layout detection and multi-element extraction
- layout_detector.py: zone classification (drawing/table/title_block/notes) using line detection and text density analysis
- multi_element_extractor.py: extract dimensions, positions (П-1, X-1), GOST refs, steel grades, elevations, beam labels per zone
2026-06-01 12:29:32 +03:00

184 lines
6.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Multi-Element Extractor — extracts several kinds of elements from a drawing.
Uses layout zones and OCR output to extract:
- dimensions: dimensions (numbers next to lines in the drawing zone)
- positions: rebar position marks (П-1, X-1, etc.)
- gosts: references to GOST standards
- steel_grades: steel grades (A500C, B30, etc.)
- elevations: level marks (-1.060, etc.)
- beam_labels: beam labels (Балка Б-1, Б-2, Б-3)
- tables: structured tables (position → length, mass, etc.)
"""
import sys
import json
import re
from pathlib import Path
from typing import List, Dict
# Patterns are compiled once at import time; the loop below runs once per OCR line.
# \u0422 is CYRILLIC CAPITAL LETTER TE: the standard's name may arrive from OCR
# with either the Cyrillic letter or its Latin look-alike "T".
_GOST_RE = re.compile(r'ГОС\s*[\u0422T]?\s*\d+')
_STEEL_GRADE_RE = re.compile(r'A500C|B30|C\d+', re.IGNORECASE)
_BEAM_LABEL_RE = re.compile(r'Балка\s+Б-\d+')
_POSITION_RE = re.compile(r'^[ПX]-\d+$')
# \u2212 is the Unicode MINUS SIGN, accepted alongside the ASCII hyphen-minus.
_ELEVATION_RE = re.compile(r'^[-\u2212]?\d+[,.]\d+$')
_DIMENSION_RE = re.compile(r'^\d{2,4}$')
# Integer strings that match the dimension pattern but are excluded from it.
_DIMENSION_NOISE = ('00', '000', '006')
# Decimal values at or above this threshold are not reported as elevations.
_MAX_ELEVATION = 10


def extract_from_zone(ocr_lines: List[Dict], zone_type: str, zone_bbox: List[int]) -> Dict:
    """Extract typed elements from the OCR lines located inside one layout zone.

    Args:
        ocr_lines: OCR entries; each is a dict providing "text", "cx", "cy"
            (centre coordinates) and "bbox".
        zone_type: Zone label. "drawing" enables dimension extraction and
            "table" enables row structuring; other values enable neither.
        zone_bbox: Zone bounds as [x1, y1, x2, y2]. A line belongs to the zone
            when its centre lies within these bounds (inclusive).

    Returns:
        A dict with the keys "dimensions", "positions", "gosts",
        "steel_grades", "elevations", "beam_labels" (lists of
        {"text", "bbox"} dicts) and "table_rows" (the output of
        structure_table for table zones, otherwise an empty list).
        One OCR line may be reported under several keys.
    """
    results = {
        "dimensions": [],
        "positions": [],
        "gosts": [],
        "steel_grades": [],
        "elevations": [],
        "beam_labels": [],
        "table_rows": [],
    }
    x1, y1, x2, y2 = zone_bbox
    zone_texts = [
        t for t in ocr_lines
        if x1 <= t["cx"] <= x2 and y1 <= t["cy"] <= y2
    ]

    for t in zone_texts:
        txt = t["text"].strip()

        # GOST standard references (matched anywhere in the text).
        if _GOST_RE.search(txt):
            results["gosts"].append({"text": txt, "bbox": t["bbox"]})

        # Steel grades (matched anywhere in the text, case-insensitive).
        if _STEEL_GRADE_RE.search(txt):
            results["steel_grades"].append({"text": txt, "bbox": t["bbox"]})

        # Beam labels (must start the text).
        if _BEAM_LABEL_RE.match(txt):
            results["beam_labels"].append({"text": txt, "bbox": t["bbox"]})

        # Position marks (the whole text must be the mark).
        if _POSITION_RE.match(txt):
            results["positions"].append({"text": txt, "bbox": t["bbox"]})

        # Elevation marks: signed decimals below the threshold.
        if _ELEVATION_RE.match(txt):
            # Normalise the decimal comma and the Unicode minus so float()
            # can parse the value; the stored text stays as recognised.
            value = float(txt.replace(',', '.').replace('\u2212', '-'))
            if value < _MAX_ELEVATION:
                results["elevations"].append({"text": txt, "bbox": t["bbox"]})

        # Dimensions: only 2-4 digit integers, and only inside drawing zones.
        if (zone_type == "drawing"
                and _DIMENSION_RE.match(txt)
                and txt not in _DIMENSION_NOISE):
            results["dimensions"].append({"text": txt, "bbox": t["bbox"]})

    # Table zones additionally get their lines grouped into rows.
    if zone_type == "table":
        results["table_rows"] = structure_table(zone_texts)

    return results
def structure_table(zone_texts: List[Dict], y_tolerance: float = 20) -> List[Dict]:
    """Group OCR entries into table rows by vertical proximity.

    Entries are visited in ascending "cy" order. An entry joins the current
    row when its "cy" differs from the previously visited entry's "cy" by
    less than ``y_tolerance``; otherwise it starts a new row. The comparison
    is against the previous entry, not the first entry of the row, so a row
    can span more than ``y_tolerance`` in total.

    Args:
        zone_texts: OCR entries; each is a dict providing "text", "cx", "cy".
        y_tolerance: Maximum (exclusive) vertical distance, in the same units
            as "cy", between consecutive entries of one row. Defaults to 20.

    Returns:
        A list of {"cells": [...]} dicts, one per row from top to bottom,
        with the cell texts ordered by ascending "cx". An empty input
        yields an empty list.
    """
    if not zone_texts:
        return []

    grouped: List[List[Dict]] = []
    previous_y = None
    for entry in sorted(zone_texts, key=lambda t: t["cy"]):
        if grouped and abs(entry["cy"] - previous_y) < y_tolerance:
            grouped[-1].append(entry)
        else:
            grouped.append([entry])
        previous_y = entry["cy"]

    # Within each row, cells read left to right.
    return [
        {"cells": [cell["text"] for cell in sorted(row, key=lambda c: c["cx"])]}
        for row in grouped
    ]
# Result categories whose items are {"text", "bbox"} dicts. "tables" is kept
# separate because its items describe whole tables and carry no "text" key.
_TEXT_CATEGORIES = (
    "dimensions",
    "positions",
    "gosts",
    "steel_grades",
    "elevations",
    "beam_labels",
)


def extract_all_elements(png_path: Path, ocr_path: Path, layout_path: Path) -> Dict:
    """Extract elements from every layout region of a page.

    Args:
        png_path: Path of the page image. Not read by this function; kept in
            the signature for callers.
        ocr_path: JSON file with a "pages" list; each page may hold an
            "ocr_lines" list of {"text", "bbox"} entries. A bbox is either a
            flat [x1, y1, x2, y2] list or a list of [x, y] points.
        layout_path: JSON file with a "regions" list of {"type", "bbox"}
            entries.

    Returns:
        A dict with one list per text category ("dimensions", "positions",
        "gosts", "steel_grades", "elevations", "beam_labels"), de-duplicated
        by text with the first occurrence kept, plus "tables": one
        {"bbox", "rows"} dict per region that produced table rows.
    """
    ocr = json.loads(ocr_path.read_text(encoding="utf-8"))
    layout = json.loads(layout_path.read_text(encoding="utf-8"))

    # Collect every OCR line together with the centre of its bounding box.
    all_texts = []
    for page in ocr.get("pages", []):
        for line in page.get("ocr_lines", []):
            bbox = line.get("bbox", [])
            if not bbox:
                continue
            if isinstance(bbox[0], list):
                # Polygon form: average over all points.
                xs = [p[0] for p in bbox]
                ys = [p[1] for p in bbox]
            else:
                # Flat [x1, y1, x2, y2] form.
                xs = [bbox[0], bbox[2]]
                ys = [bbox[1], bbox[3]]
            all_texts.append({
                "text": line["text"],
                "cx": sum(xs) / len(xs),
                "cy": sum(ys) / len(ys),
                "bbox": bbox,
            })

    all_results = {key: [] for key in _TEXT_CATEGORIES}
    all_results["tables"] = []

    for region in layout.get("regions", []):
        zone_results = extract_from_zone(all_texts, region["type"], region["bbox"])
        for key in _TEXT_CATEGORIES:
            all_results[key].extend(zone_results.get(key, []))
        # Per-zone results use the key "table_rows"; without this explicit
        # mapping the structured rows would never reach the "tables" output.
        rows = zone_results.get("table_rows")
        if rows:
            all_results["tables"].append({"bbox": region["bbox"], "rows": rows})

    # Drop repeated texts within each text category. Tables are skipped:
    # their items have no "text" key and repeated rows are legitimate.
    for key in _TEXT_CATEGORIES:
        seen = set()
        unique = []
        for item in all_results[key]:
            if item["text"] not in seen:
                seen.add(item["text"])
                unique.append(item)
        all_results[key] = unique

    return all_results


def _summarize_item(item: Dict) -> str:
    """Return a one-line preview: an element's text, or a table's row count."""
    if "text" in item:
        return item["text"]
    return f"table with {len(item.get('rows', []))} rows"


def main():
    """Command-line entry point.

    Expects three arguments (page image, OCR JSON, layout JSON), writes the
    extracted elements to "elements.json" next to the image and prints a
    short per-category summary. Exits with status 1 when arguments are
    missing.
    """
    if len(sys.argv) < 4:
        print("Usage: python multi_element_extractor.py <png> <ocr_json> <layout_json>")
        sys.exit(1)

    png, ocr, layout = (Path(arg) for arg in sys.argv[1:4])
    results = extract_all_elements(png, ocr, layout)

    out = png.parent / "elements.json"
    with open(out, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"[OK] Elements saved: {out}")

    # Show at most five items per category to keep the summary short.
    for key, items in results.items():
        print(f" {key}: {len(items)} items")
        for item in items[:5]:
            print(f" {_summarize_item(item)}")
        if len(items) > 5:
            print(f" ... and {len(items)-5} more")


if __name__ == "__main__":
    main()