# transcription/tests/test_ingest.py

"""Tests for document ingestion pipeline."""

import json
import sys
import tempfile
import unittest
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

from src.ingest.classify import _parse_json
from src.ingest.formatter import format_index_document
from src.ingest.models import NormalizedDocument, DocumentChunk
from src.ingest.router import (
    extract_document,
    is_audio_file,
    is_document_file,
    is_extractable,
    resolve_upload_kind,
)
from src.ingest.stub_writer import write_stub


def test_is_audio_file():
    """Check ``is_audio_file`` on one accepted (.mp4) and one rejected (.pdf) name."""
    expectations = (
        ("meeting.mp4", True),
        ("notes.pdf", False),
    )
    for filename, expected in expectations:
        # Identity comparison keeps the check strict: the result must be a real bool.
        assert is_audio_file(filename) is expected


def test_is_document_file():
    """Check ``is_document_file`` on .pdf and .xlsx (accepted) and .wav (rejected)."""
    expectations = (
        ("spec.pdf", True),
        ("audio.wav", False),
        ("data.xlsx", True),
    )
    for filename, expected in expectations:
        # Identity comparison keeps the check strict: the result must be a real bool.
        assert is_document_file(filename) is expected


def test_extract_text_md():
    """Extract a small UTF-8 Markdown file and verify its text and metadata.

    The project and document type passed to ``extract_document`` are expected
    to come back unchanged on the returned document.
    """
    with tempfile.TemporaryDirectory() as workdir:
        note_path = Path(workdir, "note.md")
        note_path.write_text("# Заголовок\n\nТекст документа.", encoding="utf-8")

        extracted = extract_document(note_path, "test-project", "specification")

        # The extracted text must be non-empty and contain the body paragraph.
        assert extracted.full_text
        assert "Текст документа" in extracted.full_text
        # Caller-supplied metadata is carried through as-is.
        assert extracted.project == "test-project"
        assert extracted.doc_type == "specification"


def test_extract_csv():
    """Extract a two-row CSV file and verify a header name appears in the text."""
    with tempfile.TemporaryDirectory() as workdir:
        table_path = Path(workdir, "data.csv")
        table_path.write_text("col1,col2\na,b\n", encoding="utf-8")

        extracted = extract_document(table_path, "gp-2026", "estimate")

        assert "col1" in extracted.full_text


def test_format_index_document():
    """Format a one-chunk report and check metadata and body text are rendered.

    The formatted output must contain the title, the single key decision and
    the document's full text.
    """
    body = "Содержание отчёта"
    only_chunk = DocumentChunk(text=body, source="test.md")
    report = NormalizedDocument(
        document_id="doc_test",
        filename="test.md",
        doc_type="report",
        project="2026",
        full_text=body,
        chunks=[only_chunk],
    )

    rendered = format_index_document(
        report,
        {
            "title": "Отчёт Q1",
            "topic": "Финансы",
            "summary": "Кратко",
            "key_decisions": ["Утвердить бюджет"],
        },
    )

    for fragment in ("Отчёт Q1", "Утвердить бюджет", "Содержание отчёта"):
        assert fragment in rendered


def test_parse_classify_json():
    """Parse a classifier reply that wraps its JSON in a Markdown json code fence."""
    # The indentation inside the literal is part of the input on purpose.
    fenced_reply = """```json
    {"project": "2026", "doc_type": "contract", "title": "Договор", "topic": "Субподряд"}
    ```"""

    # NOTE(review): "2026" and "other" look like fallback project / doc_type
    # values; confirm against _parse_json.
    parsed = _parse_json(fenced_reply, "2026", "other")

    expected_fields = (
        ("doc_type", "contract"),
        ("title", "Договор"),
    )
    for key, value in expected_fields:
        assert parsed[key] == value


def test_resolve_upload_kind():
    """Check upload routing by file name.

    A .pdf name must resolve to ``"document"``, an .mp3 name to ``"audio"``,
    and a .png name must be rejected with ``ValueError``.

    Raises:
        AssertionError: if a routing result differs from the expectation or
            the .png name is accepted without raising ``ValueError``.
    """
    assert resolve_upload_kind("spec.pdf") == "document"
    assert resolve_upload_kind("call.mp3") == "audio"

    # Keep the ``try`` body down to the single call under test, and fail from
    # ``else`` with an explicit raise: unlike ``assert False`` it is not
    # stripped when Python runs with -O, so the negative case cannot pass
    # silently.
    try:
        resolve_upload_kind("image.png")
    except ValueError:
        pass
    else:
        raise AssertionError("expected ValueError")


def test_is_extractable():
    """Check ``is_extractable``: .pdf/.csv/.md are accepted, .mp4/.zip are not."""
    expectations = (
        ("spec.pdf", True),
        ("data.csv", True),
        ("notes.md", True),
        ("video.mp4", False),
        ("archive.zip", False),
    )
    for filename, expected in expectations:
        # Identity comparison keeps the check strict: the result must be a real bool.
        assert is_extractable(filename) is expected


def test_write_stub_for_binary():
    """Write a stub for a tiny fake .mp4 and check the stub file's text content."""
    with tempfile.TemporaryDirectory() as workdir:
        video_path = Path(workdir, "movie.mp4")
        # Eight zero bytes are written as the file's content.
        video_path.write_bytes(bytes(8))

        stub_path = write_stub(video_path, project="2026")

        assert stub_path.exists()
        stub_text = stub_path.read_text(encoding="utf-8")
        for marker in ("kind: video", "project: 2026"):
            assert marker in stub_text


# Script entry point: when this file is executed directly, expose the
# module-level test functions above through a unittest.TestCase and run them
# with unittest.main().
if __name__ == "__main__":
    # Each method is a thin wrapper that calls one module-level test function.
    # The wrappers deliberately have no docstrings: unittest prints a test's
    # docstring instead of its name in verbose output.
    class IngestTestCase(unittest.TestCase):
        # Wraps test_is_audio_file.
        def test_audio(self):
            test_is_audio_file()

        # Wraps test_is_document_file.
        def test_document(self):
            test_is_document_file()

        # Wraps test_parse_classify_json.
        def test_parse(self):
            test_parse_classify_json()

        # Wraps test_format_index_document.
        def test_format(self):
            test_format_index_document()

        # Wraps test_extract_text_md.
        def test_extract_md(self):
            test_extract_text_md()

        # Wraps the module-level test_extract_csv; inside the method body the
        # bare name resolves to the module global, not to this method.
        def test_extract_csv(self):
            test_extract_csv()

        # Wraps test_resolve_upload_kind.
        def test_route(self):
            test_resolve_upload_kind()

        # Wraps test_is_extractable.
        def test_extractable(self):
            test_is_extractable()

        # Wraps test_write_stub_for_binary.
        def test_stub(self):
            test_write_stub_for_binary()

    # verbosity=2 prints one result line per test method.
    unittest.main(verbosity=2)
Add document ingestion pipeline, chat analytics modes, and auth fixes Ingest MD/PDF/DOCX/XLSX into org-scoped documents with classify and RAG indexing. Add compare/timeline chat modes and UI upload. Filter WebSocket progress by user ACL and normalize admin project slugs consistently. Co-authored-by: Cursor <cursoragent@cursor.com> 2026-06-01 16:16:23 +00:00			`"""Tests for document ingestion pipeline."""`

			`import json`
			`import sys`
			`import tempfile`
			`import unittest`
			`from pathlib import Path`

			`ROOT = Path(__file__).resolve().parent.parent`
			`sys.path.insert(0, str(ROOT))`

			`from src.ingest.classify import _parse_json`
			`from src.ingest.formatter import format_index_document`
			`from src.ingest.models import NormalizedDocument, DocumentChunk`
Replace LightRAG with native Python RAG engine + add deploy tooling - New: src/rag/engine/ — in-process hybrid search (FTS5 BM25 + sqlite-vec + LLM rerank) - New: src/rag/qmd/ — compatibility layer (qmd_query, qmd_chat, qmd_chat_stream, qmd_index_*) - New: src/ingest/stub_writer.py — .md stubs for binary files (videos, archives) - New: scripts/deploy.sh + scripts/pull_models.sh + Makefile + .env.example - Removed: LightRAG, sentence-transformers embedding via separate package, rag_standalone/ - Removed: @nousresearch/qmd npm dep (package not published); Node.js from Dockerfile - Updated: tests/ (46 passed), docker-compose, .dockerignore, config.yaml, README Engine: in-process Python (no daemon, no npm), sentence-transformers 384-dim, RRF fusion (k=60), BM25 + vector with numpy fallback. WebSocket API unchanged. Deploy: 'git clone' + 'make init' + 'make pull-models MODELS_SOURCE=...' + 'make up'. Models (5.83 GB) live outside git; pulled via rsync from dev host. 2026-06-10 11:24:01 +00:00			`from src.ingest.router import (`
			`extract_document,`
			`is_audio_file,`
			`is_document_file,`
			`is_extractable,`
			`resolve_upload_kind,`
			`)`
			`from src.ingest.stub_writer import write_stub`
Add document ingestion pipeline, chat analytics modes, and auth fixes Ingest MD/PDF/DOCX/XLSX into org-scoped documents with classify and RAG indexing. Add compare/timeline chat modes and UI upload. Filter WebSocket progress by user ACL and normalize admin project slugs consistently. Co-authored-by: Cursor <cursoragent@cursor.com> 2026-06-01 16:16:23 +00:00

			`def test_is_audio_file():`
			`assert is_audio_file("meeting.mp4") is True`
			`assert is_audio_file("notes.pdf") is False`


			`def test_is_document_file():`
			`assert is_document_file("spec.pdf") is True`
			`assert is_document_file("audio.wav") is False`
			`assert is_document_file("data.xlsx") is True`


			`def test_extract_text_md():`
			`with tempfile.TemporaryDirectory() as tmp:`
			`md = Path(tmp) / "note.md"`
			`md.write_text("# Заголовок\n\nТекст документа.", encoding="utf-8")`
			`doc = extract_document(md, "test-project", "specification")`
			`assert doc.full_text`
			`assert "Текст документа" in doc.full_text`
			`assert doc.project == "test-project"`
			`assert doc.doc_type == "specification"`


			`def test_extract_csv():`
			`with tempfile.TemporaryDirectory() as tmp:`
			`csv_path = Path(tmp) / "data.csv"`
			`csv_path.write_text("col1,col2\na,b\n", encoding="utf-8")`
			`doc = extract_document(csv_path, "gp-2026", "estimate")`
			`assert "col1" in doc.full_text`


			`def test_format_index_document():`
			`doc = NormalizedDocument(`
			`document_id="doc_test",`
			`filename="test.md",`
			`doc_type="report",`
			`project="2026",`
			`full_text="Содержание отчёта",`
			`chunks=[DocumentChunk(text="Содержание отчёта", source="test.md")],`
			`)`
			`metadata = {`
			`"title": "Отчёт Q1",`
			`"topic": "Финансы",`
			`"summary": "Кратко",`
			`"key_decisions": ["Утвердить бюджет"],`
			`}`
			`text = format_index_document(doc, metadata)`
			`assert "Отчёт Q1" in text`
			`assert "Утвердить бюджет" in text`
			`assert "Содержание отчёта" in text`


			`def test_parse_classify_json():`
			raw = """```json
			`{"project": "2026", "doc_type": "contract", "title": "Договор", "topic": "Субподряд"}`
			```"""
			`meta = _parse_json(raw, "2026", "other")`
			`assert meta["doc_type"] == "contract"`
			`assert meta["title"] == "Договор"`


			`def test_resolve_upload_kind():`
			`assert resolve_upload_kind("spec.pdf") == "document"`
			`assert resolve_upload_kind("call.mp3") == "audio"`
			`try:`
			`resolve_upload_kind("image.png")`
			`assert False, "expected ValueError"`
			`except ValueError:`
			`pass`


Replace LightRAG with native Python RAG engine + add deploy tooling - New: src/rag/engine/ — in-process hybrid search (FTS5 BM25 + sqlite-vec + LLM rerank) - New: src/rag/qmd/ — compatibility layer (qmd_query, qmd_chat, qmd_chat_stream, qmd_index_*) - New: src/ingest/stub_writer.py — .md stubs for binary files (videos, archives) - New: scripts/deploy.sh + scripts/pull_models.sh + Makefile + .env.example - Removed: LightRAG, sentence-transformers embedding via separate package, rag_standalone/ - Removed: @nousresearch/qmd npm dep (package not published); Node.js from Dockerfile - Updated: tests/ (46 passed), docker-compose, .dockerignore, config.yaml, README Engine: in-process Python (no daemon, no npm), sentence-transformers 384-dim, RRF fusion (k=60), BM25 + vector with numpy fallback. WebSocket API unchanged. Deploy: 'git clone' + 'make init' + 'make pull-models MODELS_SOURCE=...' + 'make up'. Models (5.83 GB) live outside git; pulled via rsync from dev host. 2026-06-10 11:24:01 +00:00			`def test_is_extractable():`
			`assert is_extractable("spec.pdf") is True`
			`assert is_extractable("data.csv") is True`
			`assert is_extractable("notes.md") is True`
			`assert is_extractable("video.mp4") is False`
			`assert is_extractable("archive.zip") is False`


			`def test_write_stub_for_binary():`
			`with tempfile.TemporaryDirectory() as tmp:`
			`mp4 = Path(tmp) / "movie.mp4"`
			`mp4.write_bytes(b"\x00" * 8)`
			`stub = write_stub(mp4, project="2026")`
			`assert stub.exists()`
			`text = stub.read_text(encoding="utf-8")`
			`assert "kind: video" in text`
			`assert "project: 2026" in text`


Add document ingestion pipeline, chat analytics modes, and auth fixes Ingest MD/PDF/DOCX/XLSX into org-scoped documents with classify and RAG indexing. Add compare/timeline chat modes and UI upload. Filter WebSocket progress by user ACL and normalize admin project slugs consistently. Co-authored-by: Cursor <cursoragent@cursor.com> 2026-06-01 16:16:23 +00:00			`if __name__ == "__main__":`
			`class IngestTestCase(unittest.TestCase):`
			`def test_audio(self):`
			`test_is_audio_file()`

			`def test_document(self):`
			`test_is_document_file()`

			`def test_parse(self):`
			`test_parse_classify_json()`

			`def test_format(self):`
			`test_format_index_document()`

			`def test_extract_md(self):`
			`test_extract_text_md()`

			`def test_extract_csv(self):`
			`test_extract_csv()`

			`def test_route(self):`
			`test_resolve_upload_kind()`

Replace LightRAG with native Python RAG engine + add deploy tooling - New: src/rag/engine/ — in-process hybrid search (FTS5 BM25 + sqlite-vec + LLM rerank) - New: src/rag/qmd/ — compatibility layer (qmd_query, qmd_chat, qmd_chat_stream, qmd_index_*) - New: src/ingest/stub_writer.py — .md stubs for binary files (videos, archives) - New: scripts/deploy.sh + scripts/pull_models.sh + Makefile + .env.example - Removed: LightRAG, sentence-transformers embedding via separate package, rag_standalone/ - Removed: @nousresearch/qmd npm dep (package not published); Node.js from Dockerfile - Updated: tests/ (46 passed), docker-compose, .dockerignore, config.yaml, README Engine: in-process Python (no daemon, no npm), sentence-transformers 384-dim, RRF fusion (k=60), BM25 + vector with numpy fallback. WebSocket API unchanged. Deploy: 'git clone' + 'make init' + 'make pull-models MODELS_SOURCE=...' + 'make up'. Models (5.83 GB) live outside git; pulled via rsync from dev host. 2026-06-10 11:24:01 +00:00			`def test_extractable(self):`
			`test_is_extractable()`

			`def test_stub(self):`
			`test_write_stub_for_binary()`

Add document ingestion pipeline, chat analytics modes, and auth fixes Ingest MD/PDF/DOCX/XLSX into org-scoped documents with classify and RAG indexing. Add compare/timeline chat modes and UI upload. Filter WebSocket progress by user ACL and normalize admin project slugs consistently. Co-authored-by: Cursor <cursoragent@cursor.com> 2026-06-01 16:16:23 +00:00			`unittest.main(verbosity=2)`