- layout_detector.py: zone classification (drawing/table/title_block/notes) using line detection and text density analysis - multi_element_extractor.py: extract dimensions, positions (П-1, X-1), GOST refs, steel grades, elevations, beam labels per zone
184 lines
6.1 KiB
Python
184 lines
6.1 KiB
Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Multi-Element Extractor: extracts several kinds of elements from a drawing.

Uses layout zones and OCR output to extract:
- dimensions: dimension values (numbers next to lines in the drawing zone)
- positions: rebar position marks (П-1, X-1, etc.)
- gosts: references to GOST standards
- steel_grades: steel grades (A500C, B30, etc.)
- elevations: level elevation marks (-1.060, etc.)
- beam_labels: beam labels (Балка Б-1, Б-2, Б-3)
- table_data: structured tables (position → length, mass, etc.)
"""
|
||
|
||
import sys
|
||
import json
|
||
import re
|
||
from pathlib import Path
|
||
from typing import List, Dict
|
||
|
||
|
||
# Patterns used by extract_from_zone, compiled once at import time.
# OCR output freely mixes visually identical Cyrillic and Latin letters, so
# every ambiguous letter is accepted in both scripts.
_GOST_RE = re.compile(r'ГОС\s*[\u0422T]?\s*\d+')  # \u0422 = Cyrillic "Т"
_STEEL_GRADE_RE = re.compile(
    # \u0410 / \u0412 / \u0421 = Cyrillic "А" / "В" / "С"
    r'[A\u0410]500[C\u0421]|[B\u0412]30|[C\u0421]\d+',
    re.IGNORECASE,
)
_BEAM_LABEL_RE = re.compile(r'Балка\s+Б-\d+')
_POSITION_RE = re.compile(r'^[\u041f\u0425X]-\d+$')  # Cyrillic "П", Cyrillic "Х", Latin "X"
_ELEVATION_RE = re.compile(r'^[-\u2212+]?\d+[,.]\d+$')  # \u2212 = Unicode minus sign
_DIMENSION_RE = re.compile(r'^\d{2,4}$')
# Digit strings that are rejected as dimensions (treated as OCR fragments).
_DIMENSION_NOISE = ('00', '000', '006')


def extract_from_zone(ocr_lines: List[Dict], zone_type: str, zone_bbox: List[int]) -> Dict:
    """Extract drawing elements from the OCR lines that fall inside one zone.

    Args:
        ocr_lines: OCR line dicts; each must provide "text", "bbox" and the
            centre coordinates "cx" / "cy".
        zone_type: Zone label. "drawing" enables dimension extraction and
            "table" enables row structuring; other values only get the
            zone-independent extractors.
        zone_bbox: Zone rectangle as [x1, y1, x2, y2]; a line belongs to the
            zone when its centre lies inside it (borders inclusive).

    Returns:
        Dict with the list-valued keys "dimensions", "positions", "gosts",
        "steel_grades", "elevations", "beam_labels" (items are
        {"text", "bbox"} dicts) and "table_rows" (output of structure_table,
        filled only for table zones). One line may appear under several keys.
    """
    results = {
        "dimensions": [],
        "positions": [],
        "gosts": [],
        "steel_grades": [],
        "elevations": [],
        "beam_labels": [],
        "table_rows": [],
    }

    x1, y1, x2, y2 = zone_bbox
    zone_texts = [
        t for t in ocr_lines
        if x1 <= t["cx"] <= x2 and y1 <= t["cy"] <= y2
    ]

    for t in zone_texts:
        txt = t["text"].strip()

        # GOST references; the "Т" may arrive as Cyrillic or Latin.
        if _GOST_RE.search(txt):
            results["gosts"].append({"text": txt, "bbox": t["bbox"]})

        # Steel grades, case-insensitive, anywhere in the line.
        if _STEEL_GRADE_RE.search(txt):
            results["steel_grades"].append({"text": txt, "bbox": t["bbox"]})

        # Beam labels must start the line.
        if _BEAM_LABEL_RE.match(txt):
            results["beam_labels"].append({"text": txt, "bbox": t["bbox"]})

        # Position marks: the whole line must be the mark itself.
        if _POSITION_RE.match(txt):
            results["positions"].append({"text": txt, "bbox": t["bbox"]})

        # Elevation marks: signed decimals below 10. The sign may be an ASCII
        # hyphen, a Unicode minus or an explicit plus; normalise before float().
        if _ELEVATION_RE.match(txt):
            value = float(txt.replace(',', '.').replace('\u2212', '-'))
            if value < 10:
                results["elevations"].append({"text": txt, "bbox": t["bbox"]})

        # Dimensions: only 2-4 digit integers inside drawing zones.
        if (
            zone_type == "drawing"
            and _DIMENSION_RE.match(txt)
            and txt not in _DIMENSION_NOISE
        ):
            results["dimensions"].append({"text": txt, "bbox": t["bbox"]})

    # Table zones additionally get their lines grouped into rows.
    if zone_type == "table":
        results["table_rows"] = structure_table(zone_texts)

    return results
|
||
|
||
|
||
def structure_table(zone_texts: List[Dict], y_tolerance: float = 20) -> List[Dict]:
    """Group OCR lines of a table zone into rows by vertical proximity.

    Lines are walked top to bottom (ascending "cy"). A line joins the current
    row while its "cy" differs from the previously visited line by strictly
    less than ``y_tolerance``; otherwise it starts a new row. Because the
    comparison is against the previous line rather than the row start, a row
    may span more than ``y_tolerance`` in total.

    Args:
        zone_texts: OCR line dicts providing "text", "cx" and "cy".
        y_tolerance: Maximum (exclusive) vertical gap, in the same units as
            "cy", between neighbouring lines of one row. Defaults to 20,
            the value that was previously hard-coded.

    Returns:
        A list of {"cells": [text, ...]} dicts, rows ordered top to bottom
        and cells ordered left to right (ascending "cx"). Empty input yields
        an empty list. The input list is not modified.
    """
    if not zone_texts:
        return []

    rows = []
    current_row = []
    last_y = None
    for item in sorted(zone_texts, key=lambda t: t["cy"]):
        # A gap of y_tolerance or more closes the row collected so far.
        if last_y is not None and abs(item["cy"] - last_y) >= y_tolerance:
            rows.append(current_row)
            current_row = []
        current_row.append(item)
        last_y = item["cy"]
    rows.append(current_row)  # the loop always leaves one unfinished row

    return [
        {"cells": [cell["text"] for cell in sorted(row, key=lambda t: t["cx"])]}
        for row in rows
    ]
|
||
|
||
|
||
def extract_all_elements(png_path: Path, ocr_path: Path, layout_path: Path) -> Dict:
    """Extract every supported element type from all layout zones of a page.

    Args:
        png_path: Path of the page image. Not read here; kept so the
            signature stays stable for callers.
        ocr_path: JSON file shaped like {"pages": [{"ocr_lines": [...]}]},
            each line carrying "text" and "bbox". A bbox is either a list of
            [x, y] points or a flat [x1, y1, x2, y2] list.
        layout_path: JSON file shaped like {"regions": [{"type", "bbox"}]}.

    Returns:
        Dict with the list-valued keys "dimensions", "positions", "gosts",
        "steel_grades", "elevations", "beam_labels" and "tables". The first
        six hold {"text", "bbox"} items de-duplicated by text (first
        occurrence wins). "tables" holds one entry per table row:
        {"text": cells joined with " | ", "cells": [...], "zone_bbox": [...]}.
    """
    ocr = json.loads(ocr_path.read_text(encoding="utf-8"))
    layout = json.loads(layout_path.read_text(encoding="utf-8"))

    # Collect every OCR line together with the centre of its bounding box.
    all_texts = []
    for page in ocr.get("pages", []):
        for line in page.get("ocr_lines", []):
            bbox = line.get("bbox")
            text = line.get("text")
            if not bbox or text is None:
                continue  # nothing to locate or nothing to match against
            if isinstance(bbox[0], list):
                # Polygon form: average over all corner points.
                xs = [point[0] for point in bbox]
                ys = [point[1] for point in bbox]
            elif len(bbox) >= 4:
                # Flat form: [x1, y1, x2, y2].
                xs = [bbox[0], bbox[2]]
                ys = [bbox[1], bbox[3]]
            else:
                continue  # malformed flat bbox
            all_texts.append({
                "text": text,
                "cx": sum(xs) / len(xs),
                "cy": sum(ys) / len(ys),
                "bbox": bbox,
            })

    element_keys = (
        "dimensions",
        "positions",
        "gosts",
        "steel_grades",
        "elevations",
        "beam_labels",
    )
    all_results = {key: [] for key in element_keys}
    all_results["tables"] = []

    for region in layout.get("regions", []):
        zone_results = extract_from_zone(all_texts, region["type"], region["bbox"])
        for key in element_keys:
            all_results[key].extend(zone_results.get(key, []))
        # Zone results expose table rows as "table_rows" while the aggregate
        # key is "tables"; map them explicitly so they are not dropped.
        # Each row gets a "text" summary so generic consumers that read
        # item["text"] keep working.
        for row in zone_results.get("table_rows", []):
            cells = row["cells"]
            all_results["tables"].append({
                "text": " | ".join(str(cell) for cell in cells),
                "cells": cells,
                "zone_bbox": region["bbox"],
            })

    # Drop repeated texts. Table rows are left alone: identical rows can
    # legitimately occur more than once in a table.
    for key in element_keys:
        seen = set()
        unique = []
        for item in all_results[key]:
            if item["text"] not in seen:
                seen.add(item["text"])
                unique.append(item)
        all_results[key] = unique

    return all_results
|
||
|
||
|
||
def main():
    """Command-line entry point.

    Expects three arguments: the page PNG, the OCR JSON and the layout JSON.
    Runs extract_all_elements, writes the result to "elements.json" next to
    the PNG and prints a short per-category preview. Prints a usage line and
    exits with status 1 when fewer than three arguments are given.
    """
    if len(sys.argv) < 4:
        print("Usage: python multi_element_extractor.py <png> <ocr_json> <layout_json>")
        sys.exit(1)

    png, ocr, layout = (Path(arg) for arg in sys.argv[1:4])
    results = extract_all_elements(png, ocr, layout)

    # The output file lives in the same directory as the input image.
    out = png.parent / "elements.json"
    with out.open("w", encoding="utf-8") as handle:
        json.dump(results, handle, ensure_ascii=False, indent=2)

    print(f"[OK] Elements saved: {out}")
    for key in results:
        items = results[key]
        print(f" {key}: {len(items)} items")
        # Preview at most the first five entries of each category.
        for item in items[:5]:
            print(f" {item['text']}")
        if len(items) > 5:
            print(f" ... and {len(items)-5} more")


if __name__ == "__main__":
    main()
|