184 lines
6.1 KiB
Python
184 lines
6.1 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
Multi-Element Extractor — извлечение разных типов элементов из чертежа.
|
|||
|
|
|
|||
|
|
Использует layout zones и OCR для извлечения:
|
|||
|
|
- dimensions: размеры (числа рядом с линиями в зоне drawing)
|
|||
|
|
- positions: позиции арматуры (П-1, X-1, etc.)
|
|||
|
|
- gosts: ссылки на ГОСТ
|
|||
|
|
- steel_grades: марки стали (A500C, B30, etc.)
|
|||
|
|
- elevations: отметки уровней (-1.060, etc.)
|
|||
|
|
- beam_labels: Балка Б-1, Б-2, Б-3
|
|||
|
|
- table_data: структурированные таблицы (позиция → длина, масса, etc.)
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import sys
|
|||
|
|
import json
|
|||
|
|
import re
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import List, Dict
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_from_zone(ocr_lines: List[Dict], zone_type: str, zone_bbox: List[int]) -> Dict:
    """Collect recognised elements whose centre point lies inside one zone.

    Args:
        ocr_lines: OCR entries; each one must provide "text", "bbox", "cx"
            and "cy".
        zone_type: Layout label of the zone. "drawing" enables dimension
            detection; "table" enables row structuring.
        zone_bbox: Zone rectangle as [x1, y1, x2, y2]; borders are inclusive.

    Returns:
        A dict with the keys "dimensions", "positions", "gosts",
        "steel_grades", "elevations" and "beam_labels" (lists of
        {"text", "bbox"} dicts) plus "table_rows" (the result of
        structure_table, filled only for "table" zones).
    """
    # OCR engines freely swap Cyrillic letters with their Latin look-alikes,
    # so every letter that has a homoglyph is accepted in both scripts.
    # Escapes are used to make the script of each character unambiguous.
    # "ГОС" + optional "Т" (Cyrillic or Latin) + number.
    gost_re = re.compile(r'\u0413[\u041eO][\u0421C]\s*[\u0422T]?\s*\d+')
    # A500C / B30 / C<digits>, Latin or Cyrillic letters, any case.
    steel_re = re.compile(
        r'[A\u0410]500[C\u0421]|[B\u0412]30|[C\u0421]\d+', re.IGNORECASE
    )
    beam_re = re.compile(r'Балка\s+Б-\d+')
    # П-1 (Cyrillic), X-1 (Latin) or Х-1 (Cyrillic).
    position_re = re.compile(r'^[\u041fX\u0425]-\d+$')
    # The sign may be an ASCII hyphen or the typographic minus U+2212.
    elevation_re = re.compile(r'^[-\u2212]?\d+[,.]\d+$')
    dimension_re = re.compile(r'^\d{2,4}$')
    # Known OCR fragments that look like dimensions but are not.
    dimension_noise = ('00', '000', '006')

    results = {
        "dimensions": [],
        "positions": [],
        "gosts": [],
        "steel_grades": [],
        "elevations": [],
        "beam_labels": [],
        "table_rows": [],
    }

    x1, y1, x2, y2 = zone_bbox
    zone_texts = [
        t for t in ocr_lines
        if x1 <= t["cx"] <= x2 and y1 <= t["cy"] <= y2
    ]

    for t in zone_texts:
        txt = t["text"].strip()
        bbox = t["bbox"]

        def record(category: str) -> None:
            # A fresh dict per category keeps the result lists independent.
            results[category].append({"text": txt, "bbox": bbox})

        # The categories are not mutually exclusive: one text may land in
        # several of them.
        if gost_re.search(txt):
            record("gosts")

        if steel_re.search(txt):
            record("steel_grades")

        if beam_re.match(txt):
            record("beam_labels")

        if position_re.match(txt):
            record("positions")

        if elevation_re.match(txt):
            # Normalise decimal comma and typographic minus before parsing.
            value = float(txt.replace(',', '.').replace('\u2212', '-'))
            if value < 10:
                record("elevations")

        # Dimensions: whole numbers of 2-4 digits, drawing zones only.
        if (
            zone_type == "drawing"
            and dimension_re.match(txt)
            and txt not in dimension_noise
        ):
            record("dimensions")

    # Row structuring depends only on the zone contents, so it runs once.
    if zone_type == "table":
        results["table_rows"] = structure_table(zone_texts)

    return results
|
|||
|
|
|
|||
|
|
|
|||
|
|
def structure_table(zone_texts: List[Dict], y_tolerance: float = 20) -> List[Dict]:
    """Group OCR entries into table rows by vertical proximity.

    Entries are ordered by "cy"; an entry joins the current row when its
    "cy" differs from the previous entry's "cy" by less than y_tolerance,
    otherwise it starts a new row. The comparison is always against the
    immediately preceding entry, not against the first entry of the row.
    Cells inside a row are ordered left to right by "cx".

    Args:
        zone_texts: Entries providing "text", "cx" and "cy".
        y_tolerance: Maximum (exclusive) vertical gap between neighbouring
            entries of one row, in the same units as "cy". Defaults to 20,
            the previously hard-coded value.

    Returns:
        A list of {"cells": [text, ...]} dicts, top row first. Empty input
        yields an empty list.
    """
    if not zone_texts:
        return []

    ordered = sorted(zone_texts, key=lambda entry: entry["cy"])

    groups = [[ordered[0]]]
    for previous, current in zip(ordered, ordered[1:]):
        if abs(current["cy"] - previous["cy"]) < y_tolerance:
            groups[-1].append(current)
        else:
            groups.append([current])

    return [
        {"cells": [cell["text"] for cell in sorted(group, key=lambda c: c["cx"])]}
        for group in groups
    ]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_all_elements(png_path: Path, ocr_path: Path, layout_path: Path) -> Dict:
    """Extract elements from every layout region and merge them.

    Args:
        png_path: Path of the drawing image. It is not read here; the
            parameter is kept so existing callers keep working.
        ocr_path: JSON file with {"pages": [{"ocr_lines": [{"text", "bbox"}]}]}.
            A "bbox" is either a list of [x, y] points or a flat
            [x1, y1, x2, y2] list.
        layout_path: JSON file with {"regions": [{"type", "bbox"}]}.

    Returns:
        A dict with the keys "dimensions", "positions", "gosts",
        "steel_grades", "elevations", "beam_labels" (lists of
        {"text", "bbox"}, de-duplicated by text) and "tables" (one
        {"text", "cells"} dict per table row, in region order).
    """
    ocr = json.loads(ocr_path.read_text(encoding="utf-8"))
    layout = json.loads(layout_path.read_text(encoding="utf-8"))

    # Flatten all pages into one list of entries with a centre point.
    all_texts = []
    for page in ocr.get("pages", []):
        for line in page.get("ocr_lines", []):
            bbox = line.get("bbox", [])
            if not bbox:
                continue
            if isinstance(bbox[0], (list, tuple)):
                # Polygon form: a sequence of [x, y] points.
                xs = [point[0] for point in bbox]
                ys = [point[1] for point in bbox]
            else:
                # Flat form: [x1, y1, x2, y2].
                xs = [bbox[0], bbox[2]]
                ys = [bbox[1], bbox[3]]
            all_texts.append({
                "text": line["text"],
                "cx": sum(xs) / len(xs),
                "cy": sum(ys) / len(ys),
                "bbox": bbox,
            })

    all_results = {
        "dimensions": [],
        "positions": [],
        "gosts": [],
        "steel_grades": [],
        "elevations": [],
        "beam_labels": [],
        "tables": [],
    }

    for region in layout.get("regions", []):
        zone_results = extract_from_zone(all_texts, region["type"], region["bbox"])
        for key in all_results:
            if key in zone_results:
                all_results[key].extend(zone_results[key])
        # The per-zone result names its table data "table_rows", which the
        # key-by-key merge above never sees; without this step every table
        # would be dropped. Each row also gets a "text" field so that code
        # treating all categories alike (item["text"]) works for rows too.
        for row in zone_results.get("table_rows", []):
            cells = row.get("cells", [])
            all_results["tables"].append({
                "text": " | ".join(cells),
                "cells": cells,
            })

    # De-duplicate the element categories by text, keeping first occurrences.
    # Table rows are left untouched: their order and multiplicity carry
    # meaning, and two rows may legitimately have identical content.
    for key in all_results:
        if key == "tables":
            continue
        seen = set()
        unique = []
        for item in all_results[key]:
            if item["text"] not in seen:
                seen.add(item["text"])
                unique.append(item)
        all_results[key] = unique

    return all_results
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Command-line entry point.

    Expects three arguments (image path, OCR JSON, layout JSON), runs the
    extraction, writes the result to "elements.json" next to the image and
    prints a short per-category summary.
    """
    argv = sys.argv
    if len(argv) < 4:
        print("Usage: python multi_element_extractor.py <png> <ocr_json> <layout_json>")
        sys.exit(1)

    png, ocr, layout = (Path(arg) for arg in argv[1:4])

    results = extract_all_elements(png, ocr, layout)

    out = png.parent / "elements.json"
    with out.open("w", encoding="utf-8") as handle:
        json.dump(results, handle, ensure_ascii=False, indent=2)

    print(f"[OK] Elements saved: {out}")

    # Show at most this many items per category, then a remainder count.
    preview = 5
    for key in results:
        items = results[key]
        print(f" {key}: {len(items)} items")
        for item in items[:preview]:
            print(f" {item['text']}")
        hidden = len(items) - preview
        if hidden > 0:
            print(f" ... and {hidden} more")


if __name__ == "__main__":
    main()
|