transcription/tests/test_ingest.py

144 lines
4.1 KiB
Python
Raw Permalink Normal View History

"""Tests for document ingestion pipeline."""
import json
import sys
import tempfile
import unittest
from pathlib import Path
# ROOT is the directory two levels above this file (the parent of the
# directory holding this test module). It is pushed to the front of sys.path
# so the `src.*` imports that follow can be resolved when the tests are run
# without installing the project — presumably ROOT is the directory that
# contains the `src` package; confirm against the repository layout.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from src.ingest.classify import _parse_json
from src.ingest.formatter import format_index_document
from src.ingest.models import NormalizedDocument, DocumentChunk
from src.ingest.router import (
extract_document,
is_audio_file,
is_document_file,
is_extractable,
resolve_upload_kind,
)
from src.ingest.stub_writer import write_stub
def test_is_audio_file():
    """Check that ``is_audio_file`` accepts an ``.mp4`` name and rejects a ``.pdf`` name."""
    # Filename -> exact boolean the router is expected to return.
    expectations = {
        "meeting.mp4": True,
        "notes.pdf": False,
    }
    for filename, expected in expectations.items():
        assert is_audio_file(filename) is expected
def test_is_document_file():
    """Check that ``is_document_file`` accepts ``.pdf``/``.xlsx`` names and rejects ``.wav``."""
    # Filename -> exact boolean the router is expected to return
    # (kept in the same order as the original assertions).
    expectations = {
        "spec.pdf": True,
        "audio.wav": False,
        "data.xlsx": True,
    }
    for filename, expected in expectations.items():
        assert is_document_file(filename) is expected
def test_extract_text_md():
    """Extract a small UTF-8 Markdown file and verify text and metadata.

    The file is created in a temporary directory, passed to
    ``extract_document`` together with a project name and a document type,
    and the returned object must expose non-empty ``full_text`` containing
    the body text, plus the same ``project`` and ``doc_type`` that were
    passed in.
    """
    with tempfile.TemporaryDirectory() as workdir:
        note_path = Path(workdir) / "note.md"
        note_path.write_text("# Заголовок\n\nТекст документа.", encoding="utf-8")

        extracted = extract_document(note_path, "test-project", "specification")

        body = extracted.full_text
        assert body
        assert "Текст документа" in body
        assert extracted.project == "test-project"
        assert extracted.doc_type == "specification"
def test_extract_csv():
    """Extract a two-line CSV file and verify a header name appears in ``full_text``."""
    with tempfile.TemporaryDirectory() as workdir:
        table_path = Path(workdir) / "data.csv"
        table_path.write_text("col1,col2\na,b\n", encoding="utf-8")

        extracted = extract_document(table_path, "gp-2026", "estimate")

        assert "col1" in extracted.full_text
def test_format_index_document():
    """Format a one-chunk document with metadata and check the rendered text.

    The output of ``format_index_document`` must contain the metadata title,
    the single key decision and the document body.
    """
    body = "Содержание отчёта"
    only_chunk = DocumentChunk(text="Содержание отчёта", source="test.md")
    document = NormalizedDocument(
        document_id="doc_test",
        filename="test.md",
        doc_type="report",
        project="2026",
        full_text=body,
        chunks=[only_chunk],
    )
    meta = {
        "title": "Отчёт Q1",
        "topic": "Финансы",
        "summary": "Кратко",
        "key_decisions": ["Утвердить бюджет"],
    }

    rendered = format_index_document(document, meta)

    # Title, decision and body must all appear in the rendered text.
    for fragment in ("Отчёт Q1", "Утвердить бюджет", "Содержание отчёта"):
        assert fragment in rendered
def test_parse_classify_json():
    """Parse a JSON object wrapped in a Markdown ``json`` code fence.

    ``_parse_json`` receives the fenced payload plus the two extra positional
    arguments ``"2026"`` and ``"other"`` (presumably fallback project and
    document type — confirm against ``src.ingest.classify``). The returned
    mapping must carry the ``doc_type`` and ``title`` from the payload.
    """
    # Same text as a triple-quoted literal: opening fence, one JSON line,
    # closing fence, with no trailing newline.
    fenced_payload = (
        "```json\n"
        '{"project": "2026", "doc_type": "contract", "title": "Договор", "topic": "Субподряд"}\n'
        "```"
    )

    parsed = _parse_json(fenced_payload, "2026", "other")

    expected_fields = {"doc_type": "contract", "title": "Договор"}
    for key, value in expected_fields.items():
        assert parsed[key] == value
def test_resolve_upload_kind():
    """Check upload routing for a document, an audio file and an unsupported file.

    ``spec.pdf`` must resolve to ``"document"`` and ``call.mp3`` to
    ``"audio"``. ``image.png`` must be rejected with ``ValueError``.

    Raises:
        AssertionError: if a kind is resolved incorrectly, or if the
            unsupported file does not raise ``ValueError``.
    """
    assert resolve_upload_kind("spec.pdf") == "document"
    assert resolve_upload_kind("call.mp3") == "audio"

    # Keep the try body down to the single call that is expected to raise.
    try:
        resolve_upload_kind("image.png")
    except ValueError:
        pass  # expected: this extension is not an accepted upload kind
    else:
        # Raise explicitly rather than `assert False`: assert statements are
        # removed under `python -O`, which would let a missing ValueError
        # pass silently.
        raise AssertionError("expected ValueError")
def test_is_extractable():
    """Check ``is_extractable`` for pdf/csv/md names (True) and mp4/zip names (False)."""
    # Filename -> exact boolean expected, in the original assertion order.
    expectations = {
        "spec.pdf": True,
        "data.csv": True,
        "notes.md": True,
        "video.mp4": False,
        "archive.zip": False,
    }
    for filename, expected in expectations.items():
        assert is_extractable(filename) is expected
def test_write_stub_for_binary():
    """Write a stub for a tiny ``.mp4`` file and check the stub's contents.

    ``write_stub`` is called with the file path and ``project="2026"``. The
    returned path must exist and its UTF-8 text must contain the lines
    ``kind: video`` and ``project: 2026``.
    """
    with tempfile.TemporaryDirectory() as workdir:
        video_path = Path(workdir) / "movie.mp4"
        video_path.write_bytes(b"\x00" * 8)  # eight zero bytes stand in for real media

        stub_path = write_stub(video_path, project="2026")

        assert stub_path.exists()
        stub_text = stub_path.read_text(encoding="utf-8")
        for marker in ("kind: video", "project: 2026"):
            assert marker in stub_text
if __name__ == "__main__":

    class IngestTestCase(unittest.TestCase):
        """Adapter that exposes the module-level test functions to ``unittest``.

        Each method simply delegates to one plain test function, so running
        this file directly executes the same checks without pytest.
        """

        # NOTE: the methods deliberately have no docstrings. With verbosity=2
        # unittest prints a test method's docstring in its report, so adding
        # one would change the output. Definition order is irrelevant because
        # the default loader sorts test methods by name.

        # --- routing predicates ---
        def test_audio(self):
            test_is_audio_file()

        def test_document(self):
            test_is_document_file()

        def test_extractable(self):
            test_is_extractable()

        def test_route(self):
            test_resolve_upload_kind()

        # --- extraction ---
        def test_extract_md(self):
            test_extract_text_md()

        def test_extract_csv(self):
            test_extract_csv()

        # --- classification, formatting and stubs ---
        def test_parse(self):
            test_parse_classify_json()

        def test_format(self):
            test_format_index_document()

        def test_stub(self):
            test_write_stub_for_binary()

    unittest.main(verbosity=2)