""" Unit tests for Markdown intelligent chunker. Tests for MarkdownParser, MarkdownChunker, and integration. """ import pytest from app.services.document.markdown_chunker import ( MarkdownChunk, MarkdownChunker, MarkdownElement, MarkdownElementType, MarkdownParser, chunk_markdown, ) class TestMarkdownParser: """Tests for MarkdownParser.""" def test_parse_headers(self): """Test header extraction.""" text = """# Main Title ## Section 1 ### Subsection 1.1 #### Deep Header """ parser = MarkdownParser() elements = parser.parse(text) headers = [e for e in elements if e.type == MarkdownElementType.HEADER] assert len(headers) == 4 assert headers[0].content == "Main Title" assert headers[0].level == 1 assert headers[1].content == "Section 1" assert headers[1].level == 2 assert headers[2].content == "Subsection 1.1" assert headers[2].level == 3 assert headers[3].content == "Deep Header" assert headers[3].level == 4 def test_parse_code_blocks(self): """Test code block extraction with language.""" text = """Here is some code: ```python def hello(): print("Hello, World!") ``` And some more text. """ parser = MarkdownParser() elements = parser.parse(text) code_blocks = [e for e in elements if e.type == MarkdownElementType.CODE_BLOCK] assert len(code_blocks) == 1 assert code_blocks[0].language == "python" assert 'def hello():' in code_blocks[0].content assert 'print("Hello, World!")' in code_blocks[0].content def test_parse_code_blocks_no_language(self): """Test code block without language specification.""" text = """``` plain code here multiple lines ``` """ parser = MarkdownParser() elements = parser.parse(text) code_blocks = [e for e in elements if e.type == MarkdownElementType.CODE_BLOCK] assert len(code_blocks) == 1 assert code_blocks[0].language == "" assert "plain code here" in code_blocks[0].content def test_parse_tables(self): """Test table extraction.""" text = """| Name | Age | City | |------|-----|------| | Alice | 30 | NYC | | Bob | 25 | LA | """ parser = MarkdownParser() elements = parser.parse(text) tables = [e for e in elements if e.type == MarkdownElementType.TABLE] assert len(tables) == 1 assert "Name" in tables[0].content assert "Alice" in tables[0].content assert tables[0].metadata.get("headers") == ["Name", "Age", "City"] assert tables[0].metadata.get("row_count") == 2 def test_parse_lists(self): """Test list extraction.""" text = """- Item 1 - Item 2 - Item 3 """ parser = MarkdownParser() elements = parser.parse(text) lists = [e for e in elements if e.type == MarkdownElementType.LIST] assert len(lists) == 1 assert "Item 1" in lists[0].content assert "Item 2" in lists[0].content assert "Item 3" in lists[0].content def test_parse_ordered_lists(self): """Test ordered list extraction.""" text = """1. First 2. Second 3. Third """ parser = MarkdownParser() elements = parser.parse(text) lists = [e for e in elements if e.type == MarkdownElementType.LIST] assert len(lists) == 1 assert "First" in lists[0].content assert "Second" in lists[0].content assert "Third" in lists[0].content def test_parse_blockquotes(self): """Test blockquote extraction.""" text = """> This is a quote. > It spans multiple lines. > And continues here. """ parser = MarkdownParser() elements = parser.parse(text) quotes = [e for e in elements if e.type == MarkdownElementType.BLOCKQUOTE] assert len(quotes) == 1 assert "This is a quote." in quotes[0].content assert "It spans multiple lines." in quotes[0].content def test_parse_paragraphs(self): """Test paragraph extraction.""" text = """This is the first paragraph. This is the second paragraph. It has multiple lines. This is the third. """ parser = MarkdownParser() elements = parser.parse(text) paragraphs = [e for e in elements if e.type == MarkdownElementType.PARAGRAPH] assert len(paragraphs) == 3 assert "first paragraph" in paragraphs[0].content assert "second paragraph" in paragraphs[1].content def test_parse_mixed_content(self): """Test parsing mixed Markdown content.""" text = """# Documentation ## Introduction This is an introduction paragraph. ## Code Example ```python def example(): return 42 ``` ## Data Table | Column A | Column B | |----------|----------| | Value 1 | Value 2 | ## List - Item A - Item B > Note: This is important. """ parser = MarkdownParser() elements = parser.parse(text) headers = [e for e in elements if e.type == MarkdownElementType.HEADER] code_blocks = [e for e in elements if e.type == MarkdownElementType.CODE_BLOCK] tables = [e for e in elements if e.type == MarkdownElementType.TABLE] lists = [e for e in elements if e.type == MarkdownElementType.LIST] quotes = [e for e in elements if e.type == MarkdownElementType.BLOCKQUOTE] paragraphs = [e for e in elements if e.type == MarkdownElementType.PARAGRAPH] assert len(headers) == 5 assert len(code_blocks) == 1 assert len(tables) == 1 assert len(lists) == 1 assert len(quotes) == 1 assert len(paragraphs) >= 1 def test_code_blocks_not_parsed_as_other_elements(self): """Test that code blocks don't get parsed as headers or lists.""" text = """```markdown # This is not a header - This is not a list | This is not a table | ``` """ parser = MarkdownParser() elements = parser.parse(text) headers = [e for e in elements if e.type == MarkdownElementType.HEADER] lists = [e for e in elements if e.type == MarkdownElementType.LIST] tables = [e for e in elements if e.type == MarkdownElementType.TABLE] code_blocks = [e for e in elements if e.type == MarkdownElementType.CODE_BLOCK] assert len(headers) == 0 assert len(lists) == 0 assert len(tables) == 0 assert len(code_blocks) == 1 class TestMarkdownChunker: """Tests for MarkdownChunker.""" def test_chunk_simple_document(self): """Test chunking a simple document.""" text = """# Title This is a paragraph. ## Section Another paragraph. """ chunker = MarkdownChunker() chunks = chunker.chunk(text, "test_doc") assert len(chunks) >= 2 assert all(isinstance(chunk, MarkdownChunk) for chunk in chunks) assert all(chunk.chunk_id.startswith("test_doc") for chunk in chunks) def test_chunk_preserves_header_context(self): """Test that header context is preserved.""" text = """# Main Title ## Section A Content under section A. ### Subsection A1 Content under subsection A1. """ chunker = MarkdownChunker(include_header_context=True) chunks = chunker.chunk(text, "test") subsection_chunks = [c for c in chunks if "Subsection A1" not in c.content] for chunk in subsection_chunks: if "subsection a1" in chunk.content.lower(): assert "Main Title" in chunk.header_context assert "Section A" in chunk.header_context def test_chunk_code_blocks_preserved(self): """Test that code blocks are preserved as single chunks when possible.""" text = """```python def function_one(): pass def function_two(): pass ``` """ chunker = MarkdownChunker(max_chunk_size=2000, preserve_code_blocks=True) chunks = chunker.chunk(text, "test") code_chunks = [c for c in chunks if c.element_type == MarkdownElementType.CODE_BLOCK] assert len(code_chunks) == 1 assert "def function_one" in code_chunks[0].content assert "def function_two" in code_chunks[0].content assert code_chunks[0].language == "python" def test_chunk_large_code_block_split(self): """Test that large code blocks are split properly.""" lines = ["def function_{}(): pass".format(i) for i in range(100)] code_content = "\n".join(lines) text = f"""```python\n{code_content}\n```""" chunker = MarkdownChunker(max_chunk_size=500, preserve_code_blocks=True) chunks = chunker.chunk(text, "test") code_chunks = [c for c in chunks if c.element_type == MarkdownElementType.CODE_BLOCK] assert len(code_chunks) > 1 for chunk in code_chunks: assert chunk.language == "python" assert "```python" in chunk.content assert "```" in chunk.content def test_chunk_table_preserved(self): """Test that tables are preserved.""" text = """| Name | Age | |------|-----| | Alice | 30 | | Bob | 25 | """ chunker = MarkdownChunker(max_chunk_size=2000, preserve_tables=True) chunks = chunker.chunk(text, "test") table_chunks = [c for c in chunks if c.element_type == MarkdownElementType.TABLE] assert len(table_chunks) == 1 assert "Alice" in table_chunks[0].content assert "Bob" in table_chunks[0].content def test_chunk_large_table_split(self): """Test that large tables are split with header preserved.""" rows = [f"| Name{i} | {i * 10} |" for i in range(50)] table_content = "| Name | Age |\n|------|-----|\n" + "\n".join(rows) text = table_content chunker = MarkdownChunker(max_chunk_size=200, preserve_tables=True) chunks = chunker.chunk(text, "test") table_chunks = [c for c in chunks if c.element_type == MarkdownElementType.TABLE] assert len(table_chunks) > 1 for chunk in table_chunks: assert "| Name | Age |" in chunk.content assert "|------|-----|" in chunk.content def test_chunk_list_preserved(self): """Test that lists are chunked properly.""" text = """- Item 1 - Item 2 - Item 3 - Item 4 - Item 5 """ chunker = MarkdownChunker(max_chunk_size=2000, preserve_lists=True) chunks = chunker.chunk(text, "test") list_chunks = [c for c in chunks if c.element_type == MarkdownElementType.LIST] assert len(list_chunks) == 1 assert "Item 1" in list_chunks[0].content assert "Item 5" in list_chunks[0].content def test_chunk_empty_document(self): """Test chunking an empty document.""" text = "" chunker = MarkdownChunker() chunks = chunker.chunk(text, "test") assert len(chunks) == 0 def test_chunk_only_headers(self): """Test chunking a document with only headers.""" text = """# Title 1 ## Title 2 ### Title 3 """ chunker = MarkdownChunker() chunks = chunker.chunk(text, "test") assert len(chunks) == 0 class TestChunkMarkdownFunction: """Tests for the convenience chunk_markdown function.""" def test_basic_chunking(self): """Test basic chunking via convenience function.""" text = """# Title Content paragraph. ```python code = "here" ``` """ chunks = chunk_markdown(text, "doc1") assert len(chunks) >= 1 assert all("chunk_id" in chunk for chunk in chunks) assert all("content" in chunk for chunk in chunks) assert all("element_type" in chunk for chunk in chunks) assert all("header_context" in chunk for chunk in chunks) def test_custom_parameters(self): """Test chunking with custom parameters.""" text = "A" * 2000 chunks = chunk_markdown( text, "doc1", max_chunk_size=500, min_chunk_size=50, preserve_code_blocks=False, preserve_tables=False, preserve_lists=False, include_header_context=False, ) assert len(chunks) >= 1 class TestMarkdownElement: """Tests for MarkdownElement dataclass.""" def test_to_dict(self): """Test serialization to dictionary.""" elem = MarkdownElement( type=MarkdownElementType.HEADER, content="Test Header", level=2, line_start=10, line_end=10, metadata={"level": 2}, ) result = elem.to_dict() assert result["type"] == "header" assert result["content"] == "Test Header" assert result["level"] == 2 assert result["line_start"] == 10 assert result["line_end"] == 10 def test_code_block_with_language(self): """Test code block element with language.""" elem = MarkdownElement( type=MarkdownElementType.CODE_BLOCK, content="print('hello')", language="python", line_start=5, line_end=7, ) result = elem.to_dict() assert result["type"] == "code_block" assert result["language"] == "python" def test_table_with_metadata(self): """Test table element with metadata.""" elem = MarkdownElement( type=MarkdownElementType.TABLE, content="| A | B |\n|---|---|\n| 1 | 2 |", line_start=1, line_end=3, metadata={"headers": ["A", "B"], "row_count": 1}, ) result = elem.to_dict() assert result["type"] == "table" assert result["metadata"]["headers"] == ["A", "B"] assert result["metadata"]["row_count"] == 1 class TestMarkdownChunk: """Tests for MarkdownChunk dataclass.""" def test_to_dict(self): """Test serialization to dictionary.""" chunk = MarkdownChunk( chunk_id="doc_chunk_0", content="Test content", element_type=MarkdownElementType.PARAGRAPH, header_context=["Main Title", "Section"], metadata={"key": "value"}, ) result = chunk.to_dict() assert result["chunk_id"] == "doc_chunk_0" assert result["content"] == "Test content" assert result["element_type"] == "paragraph" assert result["header_context"] == ["Main Title", "Section"] assert result["metadata"]["key"] == "value" def test_with_language(self): """Test chunk with language info.""" chunk = MarkdownChunk( chunk_id="code_0", content="```python\nprint('hi')\n```", element_type=MarkdownElementType.CODE_BLOCK, header_context=[], language="python", ) result = chunk.to_dict() assert result["language"] == "python" class TestMarkdownElementType: """Tests for MarkdownElementType enum.""" def test_all_types_exist(self): """Test that all expected element types exist.""" expected_types = [ "header", "paragraph", "code_block", "inline_code", "table", "list", "blockquote", "horizontal_rule", "image", "link", "text", ] for type_name in expected_types: assert hasattr(MarkdownElementType, type_name.upper()) or \ any(t.value == type_name for t in MarkdownElementType)