"""Tests for document ingestion pipeline.""" import json import sys import tempfile import unittest from pathlib import Path ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) from src.ingest.classify import _parse_json from src.ingest.formatter import format_index_document from src.ingest.models import NormalizedDocument, DocumentChunk from src.ingest.router import extract_document, is_audio_file, is_document_file, resolve_upload_kind def test_is_audio_file(): assert is_audio_file("meeting.mp4") is True assert is_audio_file("notes.pdf") is False def test_is_document_file(): assert is_document_file("spec.pdf") is True assert is_document_file("audio.wav") is False assert is_document_file("data.xlsx") is True def test_extract_text_md(): with tempfile.TemporaryDirectory() as tmp: md = Path(tmp) / "note.md" md.write_text("# Заголовок\n\nТекст документа.", encoding="utf-8") doc = extract_document(md, "test-project", "specification") assert doc.full_text assert "Текст документа" in doc.full_text assert doc.project == "test-project" assert doc.doc_type == "specification" def test_extract_csv(): with tempfile.TemporaryDirectory() as tmp: csv_path = Path(tmp) / "data.csv" csv_path.write_text("col1,col2\na,b\n", encoding="utf-8") doc = extract_document(csv_path, "gp-2026", "estimate") assert "col1" in doc.full_text def test_format_index_document(): doc = NormalizedDocument( document_id="doc_test", filename="test.md", doc_type="report", project="2026", full_text="Содержание отчёта", chunks=[DocumentChunk(text="Содержание отчёта", source="test.md")], ) metadata = { "title": "Отчёт Q1", "topic": "Финансы", "summary": "Кратко", "key_decisions": ["Утвердить бюджет"], } text = format_index_document(doc, metadata) assert "Отчёт Q1" in text assert "Утвердить бюджет" in text assert "Содержание отчёта" in text def test_parse_classify_json(): raw = """```json {"project": "2026", "doc_type": "contract", "title": "Договор", "topic": "Субподряд"} ```""" meta = _parse_json(raw, "2026", "other") assert meta["doc_type"] == "contract" assert meta["title"] == "Договор" def test_resolve_upload_kind(): assert resolve_upload_kind("spec.pdf") == "document" assert resolve_upload_kind("call.mp3") == "audio" try: resolve_upload_kind("image.png") assert False, "expected ValueError" except ValueError: pass if __name__ == "__main__": class IngestTestCase(unittest.TestCase): def test_audio(self): test_is_audio_file() def test_document(self): test_is_document_file() def test_parse(self): test_parse_classify_json() def test_format(self): test_format_index_document() def test_extract_md(self): test_extract_text_md() def test_extract_csv(self): test_extract_csv() def test_route(self): test_resolve_upload_kind() unittest.main(verbosity=2)