- vlm_describer.py: objective extraction (beams, positions, GOSTs, dimensions) via qwen-vl-plus API. No error detection — only factual observation. - vlm_qc_checker.py: VLM-based QC (deprecated in favor of rules-only QC) - gost_dimension_validator.py: validate GOST references and dimension chains against known standards
236 lines
9.8 KiB
Python
236 lines
9.8 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Валидатор ГОСТ-ов и размеров на чертежах.
|
||
|
||
Проверяет OCR-результаты на:
|
||
1. Найденные ГОСТ/СНиП/СП/ТУ — сверка с базой устаревших
|
||
2. Размеры — валидация по типовым модулям и суммам
|
||
3. Низкий confidence OCR — флаги для ручной проверки
|
||
|
||
Использование:
|
||
python gost_dimension_validator.py <output_folder>
|
||
"""
|
||
|
||
import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple
|
||
|
||
# ------------------------------------------------------------------
|
||
# База устаревших ГОСТов (пример — расширяется)
|
||
# ------------------------------------------------------------------
|
||
GOST_DATABASE = {
    # Lookup table of known normative documents, keyed by designation.
    # Every entry has "status" ("active" or "obsolete") and "name";
    # obsolete entries also carry "replacement" (the superseding document).
    # NOTE(review): this is sample data meant to be extended — the statuses
    # and replacement designations should be confirmed against the current
    # official register before the report is relied upon.
    "ГОСТ 21.101-97": {"status": "active", "name": "Система проектной документации"},
    "ГОСТ 21.501-93": {"status": "obsolete", "replacement": "ГОСТ Р 21.1017-2022", "name": "Правила выполнения архитектурных чертежей"},
    "ГОСТ 2.301-68": {"status": "active", "name": "Форматы"},
    "ГОСТ 2.302-68": {"status": "obsolete", "replacement": "ГОСТ 2.302-2019", "name": "Масштабы"},
    "ГОСТ 2.303-68": {"status": "obsolete", "replacement": "ГОСТ 2.303-2020", "name": "Линии"},
    "ГОСТ 2.304-81": {"status": "obsolete", "replacement": "ГОСТ 2.304-2021", "name": "Шрифты чертежные"},
    "ГОСТ 2.305-2008": {"status": "active", "name": "Изображения виды"},
    "ГОСТ 2.307-2011": {"status": "active", "name": "Нанесение размеров"},
    "СНиП II-22-81": {"status": "obsolete", "replacement": "СП 70.13330.2012", "name": "Каменные и армокаменные конструкции"},
    "СНиП 2.01.07-85": {"status": "obsolete", "replacement": "СП 20.13330.2016", "name": "Нагрузки и воздействия"},
    "СНиП 31-01-2003": {"status": "obsolete", "replacement": "СП 54.13330.2016", "name": "Жилые многоквартирные дома"},
}
|
||
|
||
# Typical construction modules (mm). Because 100 is in the list, every
# multiple of 100 passes a "multiple of some module" test.
CONSTRUCTION_MODULES = [100, 200, 300, 400, 500, 600, 1000, 1200, 1500, 1800, 2400, 3000, 3600, 4200, 5400, 6000, 6600]
|
||
|
||
# ------------------------------------------------------------------
|
||
# Парсеры
|
||
# ------------------------------------------------------------------
|
||
def extract_gosts(text: str) -> List[Tuple[str, int]]:
    """Find normative references (ГОСТ / СНиП / СП / ТУ) in *text*.

    Args:
        text: Arbitrary text (PDF text layer and/or OCR output).

    Returns:
        A list of ``(matched_text, start_offset)`` tuples. Results are
        grouped by document family in the order ГОСТ, СНиП, СП, ТУ; within
        a family they appear in text order. Matching is case-insensitive.
    """
    patterns = (
        # ГОСТ 12345-67, ГОСТ Р 21.1017-2022
        r'ГОСТ\s*Р?\s*\d{1,5}(?:[-.]\d+)*(?:-\d{2,4})?',
        # СНиП II-22-81, СНиП 31-01-2003, СНиП 2.01.07-85.
        # Any number of "-NN" / ".NN" groups is accepted so that four-part
        # designations keep their trailing year instead of being cut short.
        r'СНиП\s*(?:[IVX]+[-.])?\s*\d{1,3}(?:[-.]\d{1,4})+',
        # СП 54.13330.2016
        r'СП\s*\d{1,3}\.\d{1,6}\.\d{4}',
        # ТУ 400-...
        r'ТУ\s*\d{1,4}(?:[-/]\d+)*[-.]\d{4}',
    )
    found: List[Tuple[str, int]] = []
    for pattern in patterns:
        found.extend(
            (match.group(0), match.start())
            for match in re.finditer(pattern, text, re.IGNORECASE)
        )
    return found
|
||
|
||
|
||
def extract_dimensions(text: str) -> List[Tuple[str, float]]:
    """Extract linear dimensions from *text*, normalised to millimetres.

    A dimension is a number of up to five integer digits with an optional
    one- or two-digit fraction (``.`` or ``,`` as separator), optionally
    followed by a unit: ``мм``, ``см`` or ``м``. A number without a unit is
    taken to be millimetres.

    Args:
        text: Arbitrary text (PDF text layer and/or OCR output).

    Returns:
        A list of ``(matched_text, value_mm)`` tuples for every match whose
        value in millimetres lies within 10..50000 inclusive.
    """
    unit_to_mm = {"мм": 1.0, "см": 10.0, "м": 1000.0}
    # The unit group is optional as a whole so that, when no unit follows,
    # the match ends right after the digits and carries no trailing blanks.
    pattern = r'\b(\d{1,5}(?:[.,]\d{1,2})?)(?:\s*(мм|см|м))?\b'

    found: List[Tuple[str, float]] = []
    for match in re.finditer(pattern, text, re.IGNORECASE):
        number = float(match.group(1).replace(',', '.'))
        unit = (match.group(2) or "мм").lower()
        # Rounding removes binary floating-point noise from the conversion.
        value_mm = round(number * unit_to_mm[unit], 3)
        if 10 <= value_mm <= 50000:  # realistic construction dimensions
            found.append((match.group(0), value_mm))
    return found
|
||
|
||
|
||
def is_typical_module(
    dim: float,
    tolerance: float = 5.0,
    modules: Optional[Sequence[float]] = None,
) -> bool:
    """Tell whether *dim* is, within *tolerance*, a multiple of a module.

    Args:
        dim: Dimension to test, in the same unit as the modules.
        tolerance: Maximum allowed deviation (exclusive) from the nearest
            positive multiple of a module.
        modules: Module sizes to test against; defaults to
            ``CONSTRUCTION_MODULES`` when omitted.

    Returns:
        ``True`` if *dim* lies strictly closer than *tolerance* to
        ``k * module`` for some module and some integer ``k >= 1``.
    """
    candidates = CONSTRUCTION_MODULES if modules is None else modules
    for mod in candidates:
        if mod <= 0:
            continue  # a non-positive module has no meaningful multiples
        if dim <= mod - tolerance:
            continue  # too small to be near even the first multiple
        remainder = dim % mod
        # Distance to the nearest multiple, measured in both directions so
        # that 7198 is treated the same as 7202 for a module of 100.
        if min(remainder, mod - remainder) < tolerance:
            return True
    return False
|
||
|
||
|
||
def validate_gost(gost: str, database: Optional[Dict[str, dict]] = None) -> dict:
    """Look up the status of a normative reference.

    Lookup order:
      1. exact match on the normalised designation;
      2. the designation is listed as the ``"replacement"`` of an entry —
         reported as ``"active"`` (as far as the database knows);
      3. match on the designation with the trailing ``-<year>`` removed;
      4. otherwise ``"unknown"``.

    Designations are compared case-insensitively with whitespace collapsed,
    so mixed-case keys such as "СНиП ..." match their OCR'd counterparts.

    Args:
        gost: Designation as found in the text.
        database: Mapping of designation to an info dict with at least a
            ``"status"`` key; defaults to ``GOST_DATABASE`` when omitted.

    Returns:
        A new dict with ``"gost"`` (the original string), ``"status"`` and,
        depending on the branch, the entry's other fields and a ``"note"``.
    """
    db = GOST_DATABASE if database is None else database
    gost_norm = _normalize_designation(gost)
    normalized = {
        _normalize_designation(key): (key, info) for key, info in db.items()
    }

    # Exact match
    if gost_norm in normalized:
        result = normalized[gost_norm][1].copy()
        result["gost"] = gost
        return result

    # The document is itself the replacement of an obsolete entry. This must
    # be checked before the year-less match, which would otherwise report
    # the new edition with the old edition's "obsolete" status.
    for key, info in db.items():
        replacement = info.get("replacement")
        if replacement and _normalize_designation(replacement) == gost_norm:
            result = {
                "gost": gost,
                "status": "active",
                "note": f"Действующая замена для {key}",
            }
            if "name" in info:
                result["name"] = info["name"]
            return result

    # Fuzzy match (ignoring the year suffix)
    base = _strip_year(gost_norm)
    for norm_key, (key, info) in normalized.items():
        if base == _strip_year(norm_key):
            result = info.copy()
            result["gost"] = gost
            result["note"] = f"Найден по базовому номеру ({key})"
            return result

    return {"gost": gost, "status": "unknown", "note": "Не найден в базе"}


def _normalize_designation(designation: str) -> str:
    """Upper-case *designation* and collapse runs of whitespace."""
    return re.sub(r'\s+', ' ', designation.strip().upper())


def _strip_year(designation: str) -> str:
    """Drop a trailing ``-<2..4 digits>`` year suffix from *designation*."""
    return re.sub(r'-\d{2,4}$', '', designation)
|
||
|
||
|
||
# ------------------------------------------------------------------
|
||
# Основная логика
|
||
# ------------------------------------------------------------------
|
||
def validate_folder(folder: Path) -> dict:
    """Validate the OCR data stored in ``<folder>/full_ocr_results.json``.

    For every page the text layer and OCR lines are combined, then:
      1. normative references are extracted and looked up;
      2. dimensions are extracted and tested against the typical modules —
         the spans occupied by normative references are blanked out first,
         so the digits of e.g. "ГОСТ 21.501-93" are not taken for sizes;
      3. OCR lines with confidence below 0.6 are collected.

    A summary is printed to stdout and the full report is written to
    ``<folder>/validation_report.json``.

    Args:
        folder: Directory containing ``full_ocr_results.json``.

    Returns:
        The report dict that was written to disk.

    Raises:
        SystemExit: With code 1 if the OCR results file does not exist.
        KeyError: If the JSON lacks ``"pages"`` or a page lacks
            ``"page_number"``.
    """
    ocr_path = folder / "full_ocr_results.json"
    if not ocr_path.exists():
        print(f"[ERR] Не найден {ocr_path}")
        sys.exit(1)

    data = json.loads(ocr_path.read_text(encoding="utf-8"))
    pages = data["pages"]

    print(f"[INFO] Проверка {len(pages)} страниц...\n")

    all_gosts = []
    all_dims = []
    low_confidence_items = []

    for page in pages:
        page_num = page["page_number"]
        ocr_lines = page.get("ocr_lines") or []

        # --- 1. Normative references ---
        # Text layer first, then every OCR line, separated by single spaces.
        full_text = " ".join(
            [page.get("pdf_text_layer") or ""]
            + [line.get("text") or "" for line in ocr_lines]
        )
        gosts = extract_gosts(full_text)
        for gost, _pos in gosts:
            all_gosts.append({"page": page_num, "gost": gost, **validate_gost(gost)})

        # --- 2. Dimensions ---
        dims_text = _mask_spans(
            full_text, [(pos, pos + len(gost)) for gost, pos in gosts]
        )
        for dim_text, dim_val in extract_dimensions(dims_text):
            all_dims.append({
                "page": page_num,
                "text": dim_text,
                "value": dim_val,
                "typical": is_typical_module(dim_val),
            })

        # --- 3. Low OCR confidence ---
        for line in ocr_lines:
            conf = line.get("confidence")
            if conf is None:
                conf = 0  # a missing confidence is flagged for manual review
            if conf < 0.6:
                low_confidence_items.append({
                    "page": page_num,
                    "text": line.get("text") or "",
                    "confidence": conf,
                    "bbox": line.get("bbox", []),
                })

    # --- Console summary ---
    _print_heading("ГОСТ/СНиП/СП/ТУ:")
    obsolete = [g for g in all_gosts if g["status"] == "obsolete"]
    active = [g for g in all_gosts if g["status"] == "active"]
    unknown = [g for g in all_gosts if g["status"] == "unknown"]

    if obsolete:
        print(f"\n⚠️ УСТАРЕВШИЕ ({len(obsolete)}):")
        for g in obsolete:
            print(f" Стр.{g['page']}: {g['gost']}")
            print(f" → Замена: {g.get('replacement', 'не указана')}")
    if active:
        print(f"\n✅ АКТУАЛЬНЫЕ ({len(active)}):")
        for g in active[:10]:
            print(f" Стр.{g['page']}: {g['gost']} ({g.get('name', '')})")
        if len(active) > 10:
            print(f" ... и ещё {len(active) - 10}")
    if unknown:
        print(f"\n❓ НЕИЗВЕСТНЫЕ ({len(unknown)}):")
        for g in unknown[:5]:
            print(f" Стр.{g['page']}: {g['gost']}")

    _print_heading("РАЗМЕРЫ:", leading_blank=True)
    typical = [d for d in all_dims if d["typical"]]
    atypical = [d for d in all_dims if not d["typical"]]

    print(f"\n✅ Типовые модули ({len(typical)}):")
    for d in typical[:10]:
        print(f" Стр.{d['page']}: {d['text']} → {d['value']} мм")

    if atypical:
        print(f"\n⚠️ НЕТИПОВЫЕ/ПРОВЕРИТЬ ({len(atypical)}):")
        for d in atypical[:10]:
            print(f" Стр.{d['page']}: {d['text']} → {d['value']} мм (не кратен модулю)")

    _print_heading("НИЗКИЙ CONFIDENCE OCR (< 0.6):", leading_blank=True)
    if low_confidence_items:
        print(f"\n⚠️ Найдено {len(low_confidence_items)} элементов для проверки:")
        for item in low_confidence_items[:15]:
            print(f" Стр.{item['page']}: '{item['text']}' (conf={item['confidence']:.2f})")
        if len(low_confidence_items) > 15:
            print(f" ... и ещё {len(low_confidence_items) - 15}")
    else:
        print("\n✅ Все элементы с высоким confidence")

    # --- JSON report ---
    report = {
        "gosts": {"obsolete": obsolete, "active": active, "unknown": unknown},
        "dimensions": {"typical": typical, "atypical": atypical},
        "low_confidence": low_confidence_items,
    }
    out_path = folder / "validation_report.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"\n[INFO] Отчёт сохранён: {out_path}")
    return report


def _mask_spans(text: str, spans: List[Tuple[int, int]]) -> str:
    """Return *text* with every ``[start, end)`` span replaced by spaces."""
    chars = list(text)
    for start, end in spans:
        # Same-length replacement keeps all other offsets valid.
        chars[start:end] = " " * (end - start)
    return "".join(chars)


def _print_heading(title: str, leading_blank: bool = False) -> None:
    """Print *title* framed by two 60-character ``=`` rules."""
    rule = "=" * 60
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)
|
||
|
||
|
||
def main():
    """Run the validator on the folder given as the first CLI argument.

    Falls back to the folder ``output_123`` when no argument is supplied.
    """
    cli_args = sys.argv[1:]
    target = Path(cli_args[0]) if cli_args else Path("output_123")
    validate_folder(target)


if __name__ == "__main__":
    main()
|