#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
|
|||
|
|
Валидатор ГОСТ-ов и размеров на чертежах.
|
|||
|
|
|
|||
|
|
Проверяет OCR-результаты на:
|
|||
|
|
1. Найденные ГОСТ/СНиП/СП/ТУ — сверка с базой устаревших
|
|||
|
|
2. Размеры — валидация по типовым модулям и суммам
|
|||
|
|
3. Низкий confidence OCR — флаги для ручной проверки
|
|||
|
|
|
|||
|
|
Использование:
|
|||
|
|
python gost_dimension_validator.py <output_folder>
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# ------------------------------------------------------------------
# Database of obsolete GOST standards (sample — to be extended)
# ------------------------------------------------------------------
GOST_DATABASE = {
    # Maps a standard designation to its status record.
    # Records with status "obsolete" also carry a "replacement" designation.
    # NOTE(review): this is a sample data set; statuses and replacements should
    # be verified against an official standards register before relying on them.
    "ГОСТ 21.101-97": {
        "status": "active",
        "name": "Система проектной документации",
    },
    "ГОСТ 21.501-93": {
        "status": "obsolete",
        "replacement": "ГОСТ Р 21.1017-2022",
        "name": "Правила выполнения архитектурных чертежей",
    },
    "ГОСТ 2.301-68": {"status": "active", "name": "Форматы"},
    "ГОСТ 2.302-68": {
        "status": "obsolete",
        "replacement": "ГОСТ 2.302-2019",
        "name": "Масштабы",
    },
    "ГОСТ 2.303-68": {
        "status": "obsolete",
        "replacement": "ГОСТ 2.303-2020",
        "name": "Линии",
    },
    "ГОСТ 2.304-81": {
        "status": "obsolete",
        "replacement": "ГОСТ 2.304-2021",
        "name": "Шрифты чертежные",
    },
    "ГОСТ 2.305-2008": {"status": "active", "name": "Изображения виды"},
    "ГОСТ 2.307-2011": {"status": "active", "name": "Нанесение размеров"},
    # The updated edition of СНиП II-22-81 (masonry structures) is СП 15.13330;
    # СП 70.13330 updates a different document (СНиП 3.03.01-87).
    # NOTE(review): a later edition of СП 15.13330 may exist — confirm.
    "СНиП II-22-81": {
        "status": "obsolete",
        "replacement": "СП 15.13330.2012",
        "name": "Каменные и армокаменные конструкции",
    },
    "СНиП 2.01.07-85": {
        "status": "obsolete",
        "replacement": "СП 20.13330.2016",
        "name": "Нагрузки и воздействия",
    },
    "СНиП 31-01-2003": {
        "status": "obsolete",
        "replacement": "СП 54.13330.2016",
        "name": "Жилые многоквартирные дома",
    },
}


# Typical construction modules (mm)
CONSTRUCTION_MODULES = [
    100, 200, 300, 400, 500, 600, 1000, 1200, 1500, 1800,
    2400, 3000, 3600, 4200, 5400, 6000, 6600,
]

# ------------------------------------------------------------------
# Parsers
# ------------------------------------------------------------------
def extract_gosts(text: str) -> List[Tuple[str, int]]:
    """Extract GOST/SNiP/SP/TU references from *text* with their offsets.

    Matching is case-insensitive. Each pattern is applied in turn, so the
    result is grouped by reference kind (ГОСТ, СНиП, СП, ТУ) and ordered by
    position within each kind.

    Args:
        text: Text to scan; an empty or ``None`` value yields an empty list.

    Returns:
        A list of ``(matched_text, start_offset)`` tuples.
    """
    if not text:
        return []

    # ``(?<!\w)`` keeps the case-insensitive prefixes from matching the tail
    # of an ordinary word (e.g. "высоту 2-3000" must not be read as "ТУ ...").
    patterns = [
        # ГОСТ 12345-67, ГОСТ Р 21.1017-2022
        r'(?<!\w)ГОСТ\s*Р?\s*\d{1,5}(?:[-.]\d+)*(?:-\d{2,4})?',
        # СНиП II-22-81, СНиП 31-01-2003, СНиП 2.01.07-85: any number of
        # "-NN"/".NN" groups, so the year of three-part numbers is kept.
        r'(?<!\w)СНиП\s*(?:[IVX]+[-.])?\s*\d{1,3}(?:[-.]\d{1,4})+',
        # СП 54.13330.2016
        r'(?<!\w)СП\s*\d{1,3}\.\d{1,6}\.\d{4}',
        # ТУ 400-...
        r'(?<!\w)ТУ\s*\d{1,4}(?:[-/]\d+)*[-.]\d{4}',
    ]

    found: List[Tuple[str, int]] = []
    for pat in patterns:
        found.extend(
            (m.group(0), m.start()) for m in re.finditer(pat, text, re.I)
        )
    return found


def extract_dimensions(text: str) -> List[Tuple[str, float]]:
    """Extract dimensions written in mm, cm or m and return them in millimetres.

    A number may carry an optional unit suffix (``мм``, ``см`` or ``м``);
    a bare number is treated as millimetres. Both ``.`` and ``,`` are accepted
    as the decimal separator. Only values that fall within 10..50000 mm after
    conversion are kept.

    Args:
        text: Text to scan; an empty or ``None`` value yields an empty list.

    Returns:
        A list of ``(matched_text, value_in_mm)`` tuples in order of appearance.
    """
    if not text:
        return []

    unit_to_mm = {"мм": 1.0, "см": 10.0, "м": 1000.0}
    # "мм" is listed first so that it is not consumed as a single "м".
    pattern = r'\b(\d{1,5}(?:[.,]\d{1,2})?)\s*(мм|см|м)?\b'

    found: List[Tuple[str, float]] = []
    for m in re.finditer(pattern, text):
        number = float(m.group(1).replace(',', '.'))
        factor = unit_to_mm.get(m.group(2), 1.0)
        # Rounding removes float noise introduced by the unit conversion.
        value_mm = round(number * factor, 3)
        if 10 <= value_mm <= 50000:  # realistic construction dimensions
            found.append((m.group(0).strip(), value_mm))
    return found


def is_typical_module(dim: float, tolerance: float = 5.0,
                      modules: Optional[List[float]] = None) -> bool:
    """Check whether a dimension is a multiple of a typical module.

    The dimension is accepted when it lies strictly within *tolerance* of a
    positive multiple (1x, 2x, ...) of any module. The tolerance is applied on
    both sides, so a value slightly below a multiple is treated the same as a
    value slightly above it.

    Args:
        dim: Dimension to check, in the same unit as the modules.
        tolerance: Maximum allowed deviation from the nearest multiple.
        modules: Module sizes to test against; defaults to
            ``CONSTRUCTION_MODULES``.

    Returns:
        ``True`` if *dim* matches a multiple of at least one module.
    """
    if modules is None:
        modules = CONSTRUCTION_MODULES

    for mod in modules:
        if mod <= 0:
            continue  # a non-positive module cannot define multiples
        multiple = round(dim / mod)
        # multiple == 0 would accept any value smaller than the tolerance.
        if multiple >= 1 and abs(dim - multiple * mod) < tolerance:
            return True
    return False


def _normalize_designation(designation: str) -> str:
    """Return a comparison key: upper-cased with all whitespace removed."""
    return re.sub(r'\s+', '', designation).upper()


def validate_gost(gost: str, database: Optional[Dict[str, dict]] = None) -> dict:
    """Look up the status of a standard designation in the database.

    Lookup order:
      1. Exact match, ignoring case and whitespace.
      2. Match against the ``replacement`` of an entry: the reference then
         points at the edition that supersedes an obsolete one and is
         reported as active rather than obsolete.
      3. Match by base number with the trailing ``-YY``/``-YYYY`` year removed.

    Args:
        gost: Designation as found in the text, e.g. ``"ГОСТ 2.302-68"``.
        database: Mapping of designation to status record; defaults to
            ``GOST_DATABASE``. The mapping is never modified.

    Returns:
        A new dict with at least ``"gost"`` (the input string) and
        ``"status"``; ``"status"`` is ``"unknown"`` when nothing matched.
    """
    if database is None:
        database = GOST_DATABASE

    gost_norm = _normalize_designation(gost)
    # Keys are normalized the same way as the input; otherwise mixed-case
    # keys such as "СНиП ..." can never equal the upper-cased input.
    normalized = [
        (_normalize_designation(key), key, info)
        for key, info in database.items()
    ]

    # Exact match
    for key_norm, _key, info in normalized:
        if key_norm == gost_norm:
            result = info.copy()
            result["gost"] = gost
            return result

    # The reference is the replacement of an obsolete entry
    for _key_norm, key, info in normalized:
        replacement = info.get("replacement")
        if replacement and _normalize_designation(replacement) == gost_norm:
            return {
                "gost": gost,
                "status": "active",
                "name": info.get("name", ""),
                "note": f"Действующая замена для {key}",
            }

    # Fuzzy search (year removed)
    base = re.sub(r'-\d{2,4}$', '', gost_norm)
    for key_norm, key, info in normalized:
        if base == re.sub(r'-\d{2,4}$', '', key_norm):
            result = info.copy()
            result["gost"] = gost
            result["note"] = f"Найден по базовому номеру ({key})"
            return result

    return {"gost": gost, "status": "unknown", "note": "Не найден в базе"}


# ------------------------------------------------------------------
# Main logic
# ------------------------------------------------------------------
def _join_page_text(page: dict) -> str:
    """Join the page's PDF text layer and its OCR line texts with spaces."""
    parts = [page.get("pdf_text_layer") or ""]
    parts.extend(line.get("text") or "" for line in page.get("ocr_lines") or [])
    return " ".join(parts)


def _blank_out(text: str, spans) -> str:
    """Overwrite every (fragment, start) span of *text* with spaces."""
    chars = list(text)
    for fragment, start in spans:
        chars[start:start + len(fragment)] = " " * len(fragment)
    return "".join(chars)


def _print_remainder(total: int, shown: int) -> None:
    """Print how many items were left out when a list was truncated."""
    if total > shown:
        print(f" ... и ещё {total - shown}")


def validate_folder(folder: Path):
    """Validate the OCR data stored in ``full_ocr_results.json``.

    Reads ``<folder>/full_ocr_results.json``, which must contain a ``"pages"``
    list. For each page the function reads ``"page_number"``,
    ``"pdf_text_layer"`` and ``"ocr_lines"`` (items with ``"text"``,
    ``"confidence"`` and ``"bbox"``). It prints a summary to stdout and writes
    ``<folder>/validation_report.json``.

    Args:
        folder: Directory that holds the OCR results.

    Returns:
        The report dict that was written to ``validation_report.json``.

    Raises:
        SystemExit: With code 1 when the OCR results file does not exist.
    """
    ocr_path = folder / "full_ocr_results.json"
    if not ocr_path.exists():
        print(f"[ERR] Не найден {ocr_path}")
        sys.exit(1)

    data = json.loads(ocr_path.read_text(encoding="utf-8"))
    pages = data["pages"]

    print(f"[INFO] Проверка {len(pages)} страниц...\n")

    all_gosts = []
    all_dims = []
    low_confidence_items = []

    for index, page in enumerate(pages, start=1):
        # Fall back to the 1-based position when the page carries no number.
        page_num = page.get("page_number", index)
        ocr_lines = page.get("ocr_lines") or []
        full_text = _join_page_text(page)

        # --- 1. Normative references ---
        references = extract_gosts(full_text)
        for gost, _pos in references:
            info = validate_gost(gost)
            all_gosts.append({"page": page_num, "gost": gost, **info})

        # --- 2. Dimensions ---
        # Numbers inside a reference ("ГОСТ 21.501-93") are not dimensions,
        # so the matched references are blanked out before scanning.
        for dim_text, dim_val in extract_dimensions(_blank_out(full_text, references)):
            all_dims.append({
                "page": page_num,
                "text": dim_text,
                "value": dim_val,
                "typical": is_typical_module(dim_val),
            })

        # --- 3. Low OCR confidence ---
        for line in ocr_lines:
            conf = line.get("confidence")
            if conf is None:
                conf = 0  # a missing confidence is treated as the lowest one
            if conf < 0.6:
                low_confidence_items.append({
                    "page": page_num,
                    "text": line.get("text") or "",
                    "confidence": conf,
                    "bbox": line.get("bbox", []),
                })

    # --- Output ---
    print("=" * 60)
    print("ГОСТ/СНиП/СП/ТУ:")
    print("=" * 60)
    obsolete = [g for g in all_gosts if g["status"] == "obsolete"]
    active = [g for g in all_gosts if g["status"] == "active"]
    unknown = [g for g in all_gosts if g["status"] == "unknown"]

    if obsolete:
        print(f"\n⚠️ УСТАРЕВШИЕ ({len(obsolete)}):")
        for g in obsolete:
            print(f" Стр.{g['page']}: {g['gost']}")
            print(f" → Замена: {g.get('replacement', 'не указана')}")
    if active:
        print(f"\n✅ АКТУАЛЬНЫЕ ({len(active)}):")
        for g in active[:10]:
            print(f" Стр.{g['page']}: {g['gost']} ({g.get('name', '')})")
        _print_remainder(len(active), 10)
    if unknown:
        print(f"\n❓ НЕИЗВЕСТНЫЕ ({len(unknown)}):")
        for g in unknown[:5]:
            print(f" Стр.{g['page']}: {g['gost']}")
        _print_remainder(len(unknown), 5)

    print("\n" + "=" * 60)
    print("РАЗМЕРЫ:")
    print("=" * 60)
    typical = [d for d in all_dims if d["typical"]]
    atypical = [d for d in all_dims if not d["typical"]]

    print(f"\n✅ Типовые модули ({len(typical)}):")
    for d in typical[:10]:
        print(f" Стр.{d['page']}: {d['text']} → {d['value']} мм")
    _print_remainder(len(typical), 10)

    if atypical:
        print(f"\n⚠️ НЕТИПОВЫЕ/ПРОВЕРИТЬ ({len(atypical)}):")
        for d in atypical[:10]:
            print(f" Стр.{d['page']}: {d['text']} → {d['value']} мм (не кратен модулю)")
        _print_remainder(len(atypical), 10)

    print("\n" + "=" * 60)
    print("НИЗКИЙ CONFIDENCE OCR (< 0.6):")
    print("=" * 60)
    if low_confidence_items:
        print(f"\n⚠️ Найдено {len(low_confidence_items)} элементов для проверки:")
        for item in low_confidence_items[:15]:
            print(f" Стр.{item['page']}: '{item['text']}' (conf={item['confidence']:.2f})")
        _print_remainder(len(low_confidence_items), 15)
    else:
        print("\n✅ Все элементы с высоким confidence")

    # --- Save JSON ---
    report = {
        "gosts": {"obsolete": obsolete, "active": active, "unknown": unknown},
        "dimensions": {"typical": typical, "atypical": atypical},
        "low_confidence": low_confidence_items,
    }
    out_path = folder / "validation_report.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"\n[INFO] Отчёт сохранён: {out_path}")
    return report


def main():
    """Run the validator for the folder given as the first CLI argument.

    When no argument is supplied, the folder ``output_123`` is used.
    """
    folder = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("output_123")
    validate_folder(folder)


# Run the validator only when the file is executed as a script.
if __name__ == "__main__":
    main()