transcription/tests/test_stub_writer.py
keboss-m eee8f4c8a4 Replace LightRAG with native Python RAG engine + add deploy tooling
- New: src/rag/engine/ — in-process hybrid search (FTS5 BM25 + sqlite-vec + LLM rerank)
- New: src/rag/qmd/ — compatibility layer (qmd_query, qmd_chat, qmd_chat_stream, qmd_index_*)
- New: src/ingest/stub_writer.py — .md stubs for binary files (videos, archives)
- New: scripts/deploy.sh + scripts/pull_models.sh + Makefile + .env.example
- Removed: LightRAG, sentence-transformers embedding via separate package, rag_standalone/
- Removed: @nousresearch/qmd npm dep (package not published); Node.js from Dockerfile
- Updated: tests/ (46 passed), docker-compose, .dockerignore, config.yaml, README

Engine: in-process Python (no daemon, no npm), sentence-transformers 384-dim,
RRF fusion (k=60), BM25 + vector with numpy fallback. WebSocket API unchanged.

Deploy: 'git clone' + 'make init' + 'make pull-models MODELS_SOURCE=...' + 'make up'.
Models (5.83 GB) live outside git; pulled via rsync from dev host.
2026-06-10 14:24:01 +03:00

67 lines
2.5 KiB
Python

"""Tests for stub_writer (binary file -> .md frontmatter stub)."""
import tempfile
import unittest
from pathlib import Path
from src.ingest.stub_writer import write_stub, _infer_kind
class StubWriterTestCase(unittest.TestCase):
    """Exercise ``_infer_kind`` and ``write_stub`` from ``src.ingest.stub_writer``.

    The ``write_stub`` tests run inside a ``tempfile.TemporaryDirectory`` so
    every file they create is removed when the test finishes.
    """

    # --- _infer_kind: file extension -> kind label -------------------------

    def test_infer_kind_video(self) -> None:
        """Video extensions map to ``"video"``."""
        self.assertEqual(_infer_kind(Path("meeting.mp4")), "video")
        # Upper-case extension: matching is expected to ignore case.
        self.assertEqual(_infer_kind(Path("recording.MKV")), "video")

    def test_infer_kind_audio(self) -> None:
        """A ``.wav`` file is classified as ``"audio"``."""
        self.assertEqual(_infer_kind(Path("track.wav")), "audio")

    def test_infer_kind_image(self) -> None:
        """A ``.png`` file is classified as ``"image"``."""
        self.assertEqual(_infer_kind(Path("photo.png")), "image")

    def test_infer_kind_archive(self) -> None:
        """A ``.zip`` file is classified as ``"archive"``."""
        self.assertEqual(_infer_kind(Path("backup.zip")), "archive")

    def test_infer_kind_other(self) -> None:
        """An unrecognised extension falls back to ``"other"``."""
        self.assertEqual(_infer_kind(Path("unknown.xyz")), "other")

    # --- write_stub: binary file -> sibling .md stub -----------------------

    def test_write_stub_creates_md(self) -> None:
        """``write_stub`` creates ``<name>.<ext>.md`` next to the source file."""
        with tempfile.TemporaryDirectory() as tmp:
            mp4 = Path(tmp) / "video.mp4"
            mp4.write_bytes(b"fake-mp4-content")

            stub = write_stub(mp4, project="2026")

            # The stub keeps the original extension: video.mp4 -> video.mp4.md
            self.assertEqual(stub, mp4.with_suffix(".mp4.md"))
            self.assertTrue(stub.exists())

            text = stub.read_text(encoding="utf-8")
            self.assertIn("source: video.mp4", text)
            self.assertIn("kind: video", text)
            self.assertIn("project: 2026", text)
            # 16 == len(b"fake-mp4-content"), the payload written above.
            self.assertIn("size: 16", text)
            # Russian for "Binary file"; expected somewhere in the stub text.
            self.assertIn("Бинарный файл", text)

    def test_write_stub_overwrites(self) -> None:
        """A second ``write_stub`` call replaces an existing stub's contents."""
        with tempfile.TemporaryDirectory() as tmp:
            mp4 = Path(tmp) / "video.mp4"
            mp4.write_bytes(b"a" * 10)

            stub = write_stub(mp4, project="p1")
            # Plant a marker so we can tell whether the stub gets rewritten.
            stub.write_text("OLD", encoding="utf-8")

            stub2 = write_stub(mp4, project="p1")

            self.assertEqual(stub, stub2)
            self.assertNotIn("OLD", stub.read_text(encoding="utf-8"))

    def test_write_stub_missing_file_raises(self) -> None:
        """``write_stub`` raises ``FileNotFoundError`` for a missing source."""
        with tempfile.TemporaryDirectory() as tmp:
            missing = Path(tmp) / "nope.mp4"
            with self.assertRaises(FileNotFoundError):
                write_stub(missing, project="x")

    def test_explicit_kind_overrides_inference(self) -> None:
        """An explicit ``kind=`` argument is written to the stub as given."""
        with tempfile.TemporaryDirectory() as tmp:
            f = Path(tmp) / "weird.bin"
            f.write_bytes(b"x")

            stub = write_stub(f, project="p", kind="custom")

            self.assertIn("kind: custom", stub.read_text(encoding="utf-8"))
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()