Ingest MD/PDF/DOCX/XLSX into org-scoped documents with classify and RAG indexing. Add compare/timeline chat modes and UI upload. Filter WebSocket progress by user ACL and normalize admin project slugs consistently. Co-authored-by: Cursor <cursoragent@cursor.com>
112 lines
3.3 KiB
Python
112 lines
3.3 KiB
Python
"""Tests for document ingestion pipeline."""
|
|
|
|
import json
|
|
import sys
|
|
import tempfile
|
|
import unittest
|
|
from pathlib import Path
|
|
|
|
ROOT = Path(__file__).resolve().parent.parent
|
|
sys.path.insert(0, str(ROOT))
|
|
|
|
from src.ingest.classify import _parse_json
|
|
from src.ingest.formatter import format_index_document
|
|
from src.ingest.models import NormalizedDocument, DocumentChunk
|
|
from src.ingest.router import extract_document, is_audio_file, is_document_file, resolve_upload_kind
|
|
|
|
|
|
def test_is_audio_file():
    """Check that ``is_audio_file`` accepts ``meeting.mp4`` and rejects ``notes.pdf``."""
    expectations = (
        ("meeting.mp4", True),
        ("notes.pdf", False),
    )
    for filename, expected in expectations:
        # Identity comparison: the helper must return real booleans.
        assert is_audio_file(filename) is expected
|
|
|
|
|
|
def test_is_document_file():
    """Check that ``is_document_file`` accepts PDF/XLSX names and rejects a WAV name."""
    expectations = (
        ("spec.pdf", True),
        ("audio.wav", False),
        ("data.xlsx", True),
    )
    for filename, expected in expectations:
        # Identity comparison: the helper must return real booleans.
        assert is_document_file(filename) is expected
|
|
|
|
|
|
def test_extract_text_md():
    """Extract a temporary Markdown file and verify the resulting document.

    The extracted document must have non-empty text containing the body
    sentence, and must carry the project and document type that were passed
    to ``extract_document``.
    """
    with tempfile.TemporaryDirectory() as workdir:
        note_path = Path(workdir, "note.md")
        note_path.write_text("# Заголовок\n\nТекст документа.", encoding="utf-8")

        extracted = extract_document(note_path, "test-project", "specification")

        assert extracted.full_text
        assert "Текст документа" in extracted.full_text
        assert extracted.project == "test-project"
        assert extracted.doc_type == "specification"
|
|
|
|
|
|
def test_extract_csv():
    """Extract a temporary CSV file and verify the header name appears in the text."""
    with tempfile.TemporaryDirectory() as workdir:
        table_path = Path(workdir, "data.csv")
        table_path.write_text("col1,col2\na,b\n", encoding="utf-8")

        extracted = extract_document(table_path, "gp-2026", "estimate")

        assert "col1" in extracted.full_text
|
|
|
|
|
|
def test_format_index_document():
    """Verify that the index text contains the title, a key decision and the body.

    Builds a one-chunk ``NormalizedDocument`` plus a metadata dict and checks
    that ``format_index_document`` includes all three fragments in its output.
    """
    report_body = "Содержание отчёта"
    document = NormalizedDocument(
        document_id="doc_test",
        filename="test.md",
        doc_type="report",
        project="2026",
        full_text=report_body,
        chunks=[DocumentChunk(text=report_body, source="test.md")],
    )
    metadata = {
        "title": "Отчёт Q1",
        "topic": "Финансы",
        "summary": "Кратко",
        "key_decisions": ["Утвердить бюджет"],
    }

    rendered = format_index_document(document, metadata)

    for fragment in ("Отчёт Q1", "Утвердить бюджет", report_body):
        assert fragment in rendered
|
|
|
|
|
|
def test_parse_classify_json():
    """Verify that ``_parse_json`` reads fields from a fenced ``json`` code block.

    The payload's ``doc_type`` ("contract") differs from the third argument
    ("other"), so the assertion shows the parsed value is what is returned.
    NOTE(review): the second and third arguments look like fallback project
    and doc type values -- confirm against ``_parse_json``.
    """
    fenced_reply = """```json
{"project": "2026", "doc_type": "contract", "title": "Договор", "topic": "Субподряд"}
```"""

    parsed = _parse_json(fenced_reply, "2026", "other")

    for field, expected in (("doc_type", "contract"), ("title", "Договор")):
        assert parsed[field] == expected
|
|
|
|
|
|
def test_resolve_upload_kind():
    """Verify upload routing by file name.

    ``spec.pdf`` must resolve to ``"document"`` and ``call.mp3`` to
    ``"audio"``; ``image.png`` must be rejected with ``ValueError``.

    Raises:
        AssertionError: if a kind is wrong or no ``ValueError`` is raised
            for the unsupported file.
    """
    assert resolve_upload_kind("spec.pdf") == "document"
    assert resolve_upload_kind("call.mp3") == "audio"

    # Only the call that is expected to raise lives in the ``try`` body.
    try:
        resolve_upload_kind("image.png")
    except ValueError:
        pass
    else:
        # Raised explicitly rather than via ``assert False`` so the check is
        # not stripped when Python runs with ``-O``.
        raise AssertionError("expected ValueError")
|
|
|
|
|
|
if __name__ == "__main__":

    class IngestTestCase(unittest.TestCase):
        """Adapter that exposes the module-level test functions to ``unittest``.

        Lets the file be run directly as a script, without pytest.
        """

        # The methods deliberately have no docstrings: unittest's verbose
        # output would show a docstring's first line as the test description.

        # --- file-type detection and routing ---
        def test_audio(self):
            test_is_audio_file()

        def test_document(self):
            test_is_document_file()

        def test_route(self):
            test_resolve_upload_kind()

        # --- extraction ---
        def test_extract_md(self):
            test_extract_text_md()

        def test_extract_csv(self):
            test_extract_csv()

        # --- classification and formatting ---
        def test_parse(self):
            test_parse_classify_json()

        def test_format(self):
            test_format_index_document()

    unittest.main(verbosity=2)
|