- New: src/rag/engine/ — in-process hybrid search (FTS5 BM25 + sqlite-vec + LLM rerank) - New: src/rag/qmd/ — compatibility layer (qmd_query, qmd_chat, qmd_chat_stream, qmd_index_*) - New: src/ingest/stub_writer.py — .md stubs for binary files (videos, archives) - New: scripts/deploy.sh + scripts/pull_models.sh + Makefile + .env.example - Removed: LightRAG, sentence-transformers embedding via separate package, rag_standalone/ - Removed: @nousresearch/qmd npm dep (package not published); Node.js from Dockerfile - Updated: tests/ (46 passed), docker-compose, .dockerignore, config.yaml, README Engine: in-process Python (no daemon, no npm), sentence-transformers 384-dim, RRF fusion (k=60), BM25 + vector with numpy fallback. WebSocket API unchanged. Deploy: 'git clone' + 'make init' + 'make pull-models MODELS_SOURCE=...' + 'make up'. Models (5.83 GB) live outside git; pulled via rsync from dev host.
144 lines
4.1 KiB
Python
144 lines
4.1 KiB
Python
"""Tests for document ingestion pipeline."""
|
|
|
|
import json
|
|
import sys
|
|
import tempfile
|
|
import unittest
|
|
from pathlib import Path
|
|
|
|
ROOT = Path(__file__).resolve().parent.parent
|
|
sys.path.insert(0, str(ROOT))
|
|
|
|
from src.ingest.classify import _parse_json
|
|
from src.ingest.formatter import format_index_document
|
|
from src.ingest.models import NormalizedDocument, DocumentChunk
|
|
from src.ingest.router import (
|
|
extract_document,
|
|
is_audio_file,
|
|
is_document_file,
|
|
is_extractable,
|
|
resolve_upload_kind,
|
|
)
|
|
from src.ingest.stub_writer import write_stub
|
|
|
|
|
|
def test_is_audio_file():
    """Check is_audio_file against one accepted and one rejected filename."""
    # Filename -> exact boolean the router helper is expected to return.
    expectations = {
        "meeting.mp4": True,
        "notes.pdf": False,
    }
    for filename, expected in expectations.items():
        # Identity check: the helper must return a real bool, not just a truthy value.
        assert is_audio_file(filename) is expected
|
|
|
|
|
|
def test_is_document_file():
    """Check is_document_file for PDF and XLSX (accepted) and WAV (rejected)."""
    # (filename, expected result) pairs, evaluated in the original order.
    cases = (
        ("spec.pdf", True),
        ("audio.wav", False),
        ("data.xlsx", True),
    )
    for filename, expected in cases:
        # Identity check: the helper must return a real bool, not just a truthy value.
        assert is_document_file(filename) is expected
|
|
|
|
|
|
def test_extract_text_md():
    """Extract a small UTF-8 Markdown file and verify text and metadata.

    The document is written into a temporary directory, passed through
    extract_document, and the result must carry the body text plus the
    project and doc_type values that were supplied by the caller.
    """
    with tempfile.TemporaryDirectory() as workdir:
        note_path = Path(workdir, "note.md")
        note_path.write_text("# Заголовок\n\nТекст документа.", encoding="utf-8")

        extracted = extract_document(note_path, "test-project", "specification")

        # Extraction must yield non-empty text that contains the body sentence.
        assert extracted.full_text
        assert "Текст документа" in extracted.full_text
        # The caller-supplied labels must be carried over onto the document.
        assert extracted.project == "test-project"
        assert extracted.doc_type == "specification"
|
|
|
|
|
|
def test_extract_csv():
    """Extract a two-line CSV file and verify the header name reaches full_text."""
    with tempfile.TemporaryDirectory() as workdir:
        table_path = Path(workdir, "data.csv")
        table_path.write_text("col1,col2\na,b\n", encoding="utf-8")

        extracted = extract_document(table_path, "gp-2026", "estimate")

        # The first header cell must appear somewhere in the extracted text.
        assert "col1" in extracted.full_text
|
|
|
|
|
|
def test_format_index_document():
    """Verify format_index_document output includes metadata and body text.

    A one-chunk NormalizedDocument is formatted together with a metadata
    dict; the resulting text must contain the title, the key decision and
    the document's own content.
    """
    body = "Содержание отчёта"
    only_chunk = DocumentChunk(text=body, source="test.md")
    document = NormalizedDocument(
        document_id="doc_test",
        filename="test.md",
        doc_type="report",
        project="2026",
        full_text=body,
        chunks=[only_chunk],
    )
    metadata = dict(
        title="Отчёт Q1",
        topic="Финансы",
        summary="Кратко",
        key_decisions=["Утвердить бюджет"],
    )

    rendered = format_index_document(document, metadata)

    # Title, key decision and document body must all be present in the output.
    for fragment in ("Отчёт Q1", "Утвердить бюджет", "Содержание отчёта"):
        assert fragment in rendered
|
|
|
|
|
|
def test_parse_classify_json():
    """Verify _parse_json reads a JSON object wrapped in a Markdown code fence.

    The second and third arguments ("2026", "other") are passed exactly as
    in the call under test; the assertions only check that values from the
    JSON payload itself are returned.
    """
    # Same three-line payload as a fenced block: opening fence, JSON, closing fence.
    fenced_reply = (
        "```json\n"
        '{"project": "2026", "doc_type": "contract", "title": "Договор", "topic": "Субподряд"}\n'
        "```"
    )

    parsed = _parse_json(fenced_reply, "2026", "other")

    # Values must come from the payload, not from the "other" argument.
    assert parsed["doc_type"] == "contract"
    assert parsed["title"] == "Договор"
|
|
|
|
|
|
def test_resolve_upload_kind():
    """Verify resolve_upload_kind maps filenames to an upload kind.

    A PDF must resolve to "document", an MP3 to "audio", and a PNG must be
    rejected with ValueError.

    Raises:
        AssertionError: if a mapping is wrong or the PNG is not rejected.
    """
    assert resolve_upload_kind("spec.pdf") == "document"
    assert resolve_upload_kind("call.mp3") == "audio"

    # Keep the try body down to the single call that is expected to raise.
    try:
        resolve_upload_kind("image.png")
    except ValueError:
        pass
    else:
        # Explicit raise instead of `assert False`: it is not stripped under
        # `python -O`, so the negative check cannot silently pass.
        raise AssertionError("expected ValueError")
|
|
|
|
|
|
def test_is_extractable():
    """Check is_extractable for text-bearing versus binary filenames."""
    # (filename, expected result) pairs, evaluated in the original order.
    cases = (
        ("spec.pdf", True),
        ("data.csv", True),
        ("notes.md", True),
        ("video.mp4", False),
        ("archive.zip", False),
    )
    for filename, expected in cases:
        # Identity check: the helper must return a real bool, not just a truthy value.
        assert is_extractable(filename) is expected
|
|
|
|
|
|
def test_write_stub_for_binary():
    """Write a stub for a fake .mp4 file and verify the stub's contents.

    The returned path must exist and its UTF-8 text must contain the
    "kind: video" and "project: 2026" lines.
    """
    with tempfile.TemporaryDirectory() as workdir:
        # Eight zero bytes are enough; only the file's existence and name matter here.
        video_path = Path(workdir, "movie.mp4")
        video_path.write_bytes(b"\x00" * 8)

        stub_path = write_stub(video_path, project="2026")

        assert stub_path.exists()
        stub_text = stub_path.read_text(encoding="utf-8")
        assert "kind: video" in stub_text
        assert "project: 2026" in stub_text
|
|
|
|
|
|
if __name__ == "__main__":
    # Direct execution only: expose the plain test functions above through a
    # unittest.TestCase so `python <this file>` can run them via unittest.

    class IngestTestCase(unittest.TestCase):
        # Each method simply delegates to the module-level test function.
        # Method docstrings are deliberately omitted: with verbosity=2 unittest
        # would print them and change the runner output.

        # --- filename classification -------------------------------------
        def test_audio(self):
            test_is_audio_file()

        def test_document(self):
            test_is_document_file()

        def test_extractable(self):
            test_is_extractable()

        def test_route(self):
            test_resolve_upload_kind()

        # --- extraction ----------------------------------------------------
        def test_extract_md(self):
            test_extract_text_md()

        def test_extract_csv(self):
            # The name inside the method body resolves to the module-level
            # function, not to this method: class scope is not searched from
            # within a method.
            test_extract_csv()

        # --- classification, formatting, stubs ---------------------------
        def test_parse(self):
            test_parse_classify_json()

        def test_format(self):
            test_format_index_document()

        def test_stub(self):
            test_write_stub_for_binary()

    unittest.main(verbosity=2)
|