# transcription/tests/test_ingest.py

"""Tests for document ingestion pipeline."""

import json
import sys
import tempfile
import unittest
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

from src.ingest.classify import _parse_json
from src.ingest.formatter import format_index_document
from src.ingest.models import NormalizedDocument, DocumentChunk
from src.ingest.router import extract_document, is_audio_file, is_document_file, resolve_upload_kind


def test_is_audio_file():
    """Check that ``is_audio_file`` returns True for an .mp4 name and False for a .pdf name."""
    expectations = (
        ("meeting.mp4", True),
        ("notes.pdf", False),
    )
    for filename, expected in expectations:
        assert is_audio_file(filename) is expected


def test_is_document_file():
    """Check that ``is_document_file`` returns True for .pdf/.xlsx names and False for .wav."""
    # Order matches the original sequence of assertions.
    expectations = (
        ("spec.pdf", True),
        ("audio.wav", False),
        ("data.xlsx", True),
    )
    for filename, expected in expectations:
        assert is_document_file(filename) is expected


def test_extract_text_md():
    """Extract a UTF-8 Markdown file and check its text, project and doc_type.

    The file is written into a temporary directory that is removed when the
    ``with`` block exits.
    """
    with tempfile.TemporaryDirectory() as workdir:
        note_path = Path(workdir, "note.md")
        note_path.write_text("# Заголовок\n\nТекст документа.", encoding="utf-8")

        extracted = extract_document(note_path, "test-project", "specification")

        # The text must be non-empty and contain the body sentence.
        assert extracted.full_text
        assert "Текст документа" in extracted.full_text
        # The project and doc_type passed in must be carried onto the result.
        assert extracted.project == "test-project"
        assert extracted.doc_type == "specification"


def test_extract_csv():
    """Extract a small CSV file and check that a header name appears in the text."""
    with tempfile.TemporaryDirectory() as workdir:
        table_path = Path(workdir, "data.csv")
        table_path.write_text("col1,col2\na,b\n", encoding="utf-8")

        extracted = extract_document(table_path, "gp-2026", "estimate")
        extracted_text = extracted.full_text

        assert "col1" in extracted_text


def test_format_index_document():
    """Format a one-chunk document and check the output contains the expected fragments.

    The expected fragments are the metadata title, the single key decision,
    and the document body text.
    """
    only_chunk = DocumentChunk(text="Содержание отчёта", source="test.md")
    document = NormalizedDocument(
        document_id="doc_test",
        filename="test.md",
        doc_type="report",
        project="2026",
        full_text="Содержание отчёта",
        chunks=[only_chunk],
    )
    metadata = dict(
        title="Отчёт Q1",
        topic="Финансы",
        summary="Кратко",
        key_decisions=["Утвердить бюджет"],
    )

    rendered = format_index_document(document, metadata)

    for fragment in ("Отчёт Q1", "Утвердить бюджет", "Содержание отчёта"):
        assert fragment in rendered


def test_parse_classify_json():
    """Parse a JSON payload wrapped in a Markdown code fence and check two of its fields."""
    fenced_reply = """```json
    {"project": "2026", "doc_type": "contract", "title": "Договор", "topic": "Субподряд"}
    ```"""

    parsed = _parse_json(fenced_reply, "2026", "other")

    wanted = (("doc_type", "contract"), ("title", "Договор"))
    for key, value in wanted:
        assert parsed[key] == value


def test_resolve_upload_kind():
    """Check the upload kind for a document and an audio name, and ValueError for .png."""
    for filename, kind in (("spec.pdf", "document"), ("call.mp3", "audio")):
        assert resolve_upload_kind(filename) == kind

    # A file name that is neither document nor audio must raise ValueError.
    try:
        resolve_upload_kind("image.png")
    except ValueError:
        return
    assert False, "expected ValueError"


if __name__ == "__main__":
    # Script entry point: wrap the module-level test functions in a
    # unittest.TestCase so this file can be run directly with the stdlib
    # runner. Each method below only delegates to the matching function.
    # NOTE: keep these as comments rather than method docstrings; unittest's
    # verbose output prints the first docstring line of each test method.
    class IngestTestCase(unittest.TestCase):
        # Delegates to the audio-extension check.
        def test_audio(self):
            test_is_audio_file()

        # Delegates to the document-extension check.
        def test_document(self):
            test_is_document_file()

        # Delegates to the fenced-JSON parsing check.
        def test_parse(self):
            test_parse_classify_json()

        # Delegates to the index-document formatting check.
        def test_format(self):
            test_format_index_document()

        # Delegates to the Markdown extraction check.
        def test_extract_md(self):
            test_extract_text_md()

        # Delegates to the CSV extraction check.
        def test_extract_csv(self):
            test_extract_csv()

        # Delegates to the upload-kind routing check.
        def test_route(self):
            test_resolve_upload_kind()

    # Run the collected test cases with per-test verbose reporting.
    unittest.main(verbosity=2)