# Origin: ai-robot-core/ai-service/tests/test_markdown_chunker.py
# (listed by the source viewer as 531 lines, 15 KiB, Python)

"""
Unit tests for Markdown intelligent chunker.
Tests for MarkdownParser, MarkdownChunker, and integration.
"""
import pytest
from app.services.document.markdown_chunker import (
MarkdownChunk,
MarkdownChunker,
MarkdownElement,
MarkdownElementType,
MarkdownParser,
chunk_markdown,
)
class TestMarkdownParser:
    """Tests for MarkdownParser.

    Each test parses a small Markdown fixture and checks the elements of one
    (or several) MarkdownElementType values found in the result.
    """

    @staticmethod
    def _parse(text):
        """Parse *text* with a fresh MarkdownParser and return its elements."""
        return MarkdownParser().parse(text)

    @staticmethod
    def _of_type(elements, element_type):
        """Return, in order, the elements whose ``type`` equals *element_type*."""
        return [e for e in elements if e.type == element_type]

    def test_parse_headers(self):
        """Test header extraction."""
        text = """# Main Title
## Section 1
### Subsection 1.1
#### Deep Header
"""
        headers = self._of_type(self._parse(text), MarkdownElementType.HEADER)

        assert len(headers) == 4
        expected = [
            ("Main Title", 1),
            ("Section 1", 2),
            ("Subsection 1.1", 3),
            ("Deep Header", 4),
        ]
        for header, (content, level) in zip(headers, expected):
            assert header.content == content
            assert header.level == level

    def test_parse_code_blocks(self):
        """Test code block extraction with language."""
        text = """Here is some code:
```python
def hello():
print("Hello, World!")
```
And some more text.
"""
        code_blocks = self._of_type(
            self._parse(text), MarkdownElementType.CODE_BLOCK
        )

        assert len(code_blocks) == 1
        assert code_blocks[0].language == "python"
        assert 'def hello():' in code_blocks[0].content
        assert 'print("Hello, World!")' in code_blocks[0].content

    def test_parse_code_blocks_no_language(self):
        """Test code block without language specification."""
        text = """```
plain code here
multiple lines
```
"""
        code_blocks = self._of_type(
            self._parse(text), MarkdownElementType.CODE_BLOCK
        )

        assert len(code_blocks) == 1
        assert code_blocks[0].language == ""
        assert "plain code here" in code_blocks[0].content

    def test_parse_tables(self):
        """Test table extraction."""
        text = """| Name | Age | City |
|------|-----|------|
| Alice | 30 | NYC |
| Bob | 25 | LA |
"""
        tables = self._of_type(self._parse(text), MarkdownElementType.TABLE)

        assert len(tables) == 1
        assert "Name" in tables[0].content
        assert "Alice" in tables[0].content
        assert tables[0].metadata.get("headers") == ["Name", "Age", "City"]
        assert tables[0].metadata.get("row_count") == 2

    def test_parse_lists(self):
        """Test list extraction."""
        text = """- Item 1
- Item 2
- Item 3
"""
        lists = self._of_type(self._parse(text), MarkdownElementType.LIST)

        assert len(lists) == 1
        for item in ("Item 1", "Item 2", "Item 3"):
            assert item in lists[0].content

    def test_parse_ordered_lists(self):
        """Test ordered list extraction."""
        text = """1. First
2. Second
3. Third
"""
        lists = self._of_type(self._parse(text), MarkdownElementType.LIST)

        assert len(lists) == 1
        for item in ("First", "Second", "Third"):
            assert item in lists[0].content

    def test_parse_blockquotes(self):
        """Test blockquote extraction."""
        text = """> This is a quote.
> It spans multiple lines.
> And continues here.
"""
        quotes = self._of_type(self._parse(text), MarkdownElementType.BLOCKQUOTE)

        assert len(quotes) == 1
        assert "This is a quote." in quotes[0].content
        assert "It spans multiple lines." in quotes[0].content

    def test_parse_paragraphs(self):
        """Test paragraph extraction."""
        # Blank lines are what separate the three paragraphs; without them the
        # four text lines are contiguous and cannot split into exactly three
        # paragraphs (the second one spans two lines).
        text = """This is the first paragraph.

This is the second paragraph.
It has multiple lines.

This is the third.
"""
        paragraphs = self._of_type(
            self._parse(text), MarkdownElementType.PARAGRAPH
        )

        assert len(paragraphs) == 3
        assert "first paragraph" in paragraphs[0].content
        assert "second paragraph" in paragraphs[1].content

    def test_parse_mixed_content(self):
        """Test parsing mixed Markdown content."""
        # NOTE(review): the blocks in this fixture are not separated by blank
        # lines; confirm the parser is expected to split them regardless.
        text = """# Documentation
## Introduction
This is an introduction paragraph.
## Code Example
```python
def example():
return 42
```
## Data Table
| Column A | Column B |
|----------|----------|
| Value 1 | Value 2 |
## List
- Item A
- Item B
> Note: This is important.
"""
        elements = self._parse(text)

        exact_counts = [
            (MarkdownElementType.HEADER, 5),
            (MarkdownElementType.CODE_BLOCK, 1),
            (MarkdownElementType.TABLE, 1),
            (MarkdownElementType.LIST, 1),
            (MarkdownElementType.BLOCKQUOTE, 1),
        ]
        for element_type, expected in exact_counts:
            assert len(self._of_type(elements, element_type)) == expected
        # Only a lower bound is asserted for paragraphs.
        assert len(self._of_type(elements, MarkdownElementType.PARAGRAPH)) >= 1

    def test_code_blocks_not_parsed_as_other_elements(self):
        """Test that code blocks don't get parsed as headers or lists."""
        text = """```markdown
# This is not a header
- This is not a list
| This is not a table |
```
"""
        elements = self._parse(text)

        # Everything inside the fence must stay part of the single code block.
        for element_type in (
            MarkdownElementType.HEADER,
            MarkdownElementType.LIST,
            MarkdownElementType.TABLE,
        ):
            assert len(self._of_type(elements, element_type)) == 0
        assert len(self._of_type(elements, MarkdownElementType.CODE_BLOCK)) == 1
class TestMarkdownChunker:
    """Checks for MarkdownChunker.chunk() on small Markdown documents."""

    def test_chunk_simple_document(self):
        """A two-section document yields MarkdownChunk objects with prefixed ids."""
        source = """# Title
This is a paragraph.
## Section
Another paragraph.
"""
        produced = MarkdownChunker().chunk(source, "test_doc")

        assert len(produced) >= 2
        for item in produced:
            assert isinstance(item, MarkdownChunk)
        for item in produced:
            assert item.chunk_id.startswith("test_doc")

    def test_chunk_preserves_header_context(self):
        """Content below a nested header carries its ancestor headers."""
        source = """# Main Title
## Section A
Content under section A.
### Subsection A1
Content under subsection A1.
"""
        produced = MarkdownChunker(include_header_context=True).chunk(source, "test")

        # Skip chunks containing the header text itself ("Subsection A1").
        candidates = [c for c in produced if "Subsection A1" not in c.content]
        for candidate in candidates:
            if "subsection a1" not in candidate.content.lower():
                continue
            assert "Main Title" in candidate.header_context
            assert "Section A" in candidate.header_context

    def test_chunk_code_blocks_preserved(self):
        """A small fenced block stays in one chunk and keeps its language."""
        source = """```python
def function_one():
pass
def function_two():
pass
```
"""
        splitter = MarkdownChunker(max_chunk_size=2000, preserve_code_blocks=True)
        produced = splitter.chunk(source, "test")
        fenced = [
            c for c in produced
            if c.element_type == MarkdownElementType.CODE_BLOCK
        ]

        assert len(fenced) == 1
        only = fenced[0]
        assert "def function_one" in only.content
        assert "def function_two" in only.content
        assert only.language == "python"

    def test_chunk_large_code_block_split(self):
        """An oversized fenced block is split into several fenced chunks."""
        body = "\n".join(
            "def function_{}(): pass".format(i) for i in range(100)
        )
        source = f"""```python\n{body}\n```"""
        splitter = MarkdownChunker(max_chunk_size=500, preserve_code_blocks=True)
        produced = splitter.chunk(source, "test")
        fenced = [
            c for c in produced
            if c.element_type == MarkdownElementType.CODE_BLOCK
        ]

        assert len(fenced) > 1
        for piece in fenced:
            assert piece.language == "python"
            assert "```python" in piece.content
            assert "```" in piece.content

    def test_chunk_table_preserved(self):
        """A small table stays in one chunk containing every row."""
        source = """| Name | Age |
|------|-----|
| Alice | 30 |
| Bob | 25 |
"""
        splitter = MarkdownChunker(max_chunk_size=2000, preserve_tables=True)
        produced = splitter.chunk(source, "test")
        table_parts = [
            c for c in produced if c.element_type == MarkdownElementType.TABLE
        ]

        assert len(table_parts) == 1
        assert "Alice" in table_parts[0].content
        assert "Bob" in table_parts[0].content

    def test_chunk_large_table_split(self):
        """An oversized table is split and each part repeats the header rows."""
        data_rows = "\n".join(f"| Name{i} | {i * 10} |" for i in range(50))
        source = "| Name | Age |\n|------|-----|\n" + data_rows
        splitter = MarkdownChunker(max_chunk_size=200, preserve_tables=True)
        produced = splitter.chunk(source, "test")
        table_parts = [
            c for c in produced if c.element_type == MarkdownElementType.TABLE
        ]

        assert len(table_parts) > 1
        for part in table_parts:
            assert "| Name | Age |" in part.content
            assert "|------|-----|" in part.content

    def test_chunk_list_preserved(self):
        """A short list stays in one chunk from first to last item."""
        source = """- Item 1
- Item 2
- Item 3
- Item 4
- Item 5
"""
        splitter = MarkdownChunker(max_chunk_size=2000, preserve_lists=True)
        produced = splitter.chunk(source, "test")
        list_parts = [
            c for c in produced if c.element_type == MarkdownElementType.LIST
        ]

        assert len(list_parts) == 1
        assert "Item 1" in list_parts[0].content
        assert "Item 5" in list_parts[0].content

    def test_chunk_empty_document(self):
        """An empty string produces no chunks."""
        produced = MarkdownChunker().chunk("", "test")
        assert len(produced) == 0

    def test_chunk_only_headers(self):
        """A document consisting solely of headers produces no chunks."""
        source = """# Title 1
## Title 2
### Title 3
"""
        produced = MarkdownChunker().chunk(source, "test")
        assert len(produced) == 0
class TestChunkMarkdownFunction:
    """Checks for the module-level chunk_markdown() helper."""

    def test_basic_chunking(self):
        """Every returned chunk exposes the four expected keys."""
        markdown = """# Title
Content paragraph.
```python
code = "here"
```
"""
        results = chunk_markdown(markdown, "doc1")

        assert len(results) >= 1
        # Checked key by key, in this order, across all chunks.
        for key in ("chunk_id", "content", "element_type", "header_context"):
            assert all(key in entry for entry in results)

    def test_custom_parameters(self):
        """chunk_markdown() accepts the tuning keywords and returns chunks."""
        options = {
            "max_chunk_size": 500,
            "min_chunk_size": 50,
            "preserve_code_blocks": False,
            "preserve_tables": False,
            "preserve_lists": False,
            "include_header_context": False,
        }
        results = chunk_markdown("A" * 2000, "doc1", **options)

        assert len(results) >= 1
class TestMarkdownElement:
    """Checks for MarkdownElement.to_dict()."""

    def test_to_dict(self):
        """A header element serializes its type, content, level and line span."""
        header = MarkdownElement(
            type=MarkdownElementType.HEADER,
            content="Test Header",
            level=2,
            line_start=10,
            line_end=10,
            metadata={"level": 2},
        )
        serialized = header.to_dict()

        expected = {
            "type": "header",
            "content": "Test Header",
            "level": 2,
            "line_start": 10,
            "line_end": 10,
        }
        for key, value in expected.items():
            assert serialized[key] == value

    def test_code_block_with_language(self):
        """A code block element serializes its type and language."""
        block = MarkdownElement(
            type=MarkdownElementType.CODE_BLOCK,
            content="print('hello')",
            language="python",
            line_start=5,
            line_end=7,
        )
        serialized = block.to_dict()

        assert serialized["type"] == "code_block"
        assert serialized["language"] == "python"

    def test_table_with_metadata(self):
        """A table element serializes its metadata unchanged."""
        table = MarkdownElement(
            type=MarkdownElementType.TABLE,
            content="| A | B |\n|---|---|\n| 1 | 2 |",
            line_start=1,
            line_end=3,
            metadata={"headers": ["A", "B"], "row_count": 1},
        )
        serialized = table.to_dict()

        assert serialized["type"] == "table"
        meta = serialized["metadata"]
        assert meta["headers"] == ["A", "B"]
        assert meta["row_count"] == 1
class TestMarkdownChunk:
    """Checks for MarkdownChunk.to_dict()."""

    def test_to_dict(self):
        """A paragraph chunk serializes id, content, type, context and metadata."""
        paragraph = MarkdownChunk(
            chunk_id="doc_chunk_0",
            content="Test content",
            element_type=MarkdownElementType.PARAGRAPH,
            header_context=["Main Title", "Section"],
            metadata={"key": "value"},
        )
        serialized = paragraph.to_dict()

        expected = {
            "chunk_id": "doc_chunk_0",
            "content": "Test content",
            "element_type": "paragraph",
            "header_context": ["Main Title", "Section"],
        }
        for field_name, value in expected.items():
            assert serialized[field_name] == value
        assert serialized["metadata"]["key"] == "value"

    def test_with_language(self):
        """A code chunk serializes its language."""
        code = MarkdownChunk(
            chunk_id="code_0",
            content="```python\nprint('hi')\n```",
            element_type=MarkdownElementType.CODE_BLOCK,
            header_context=[],
            language="python",
        )
        serialized = code.to_dict()

        assert serialized["language"] == "python"
class TestMarkdownElementType:
    """Checks for the MarkdownElementType enum."""

    def test_all_types_exist(self):
        """Each expected name exists as an upper-case attribute or a member value."""
        expected_types = [
            "header",
            "paragraph",
            "code_block",
            "inline_code",
            "table",
            "list",
            "blockquote",
            "horizontal_rule",
            "image",
            "link",
            "text",
        ]
        for type_name in expected_types:
            # An upper-case attribute is enough; otherwise match by value.
            if hasattr(MarkdownElementType, type_name.upper()):
                continue
            assert any(
                member.value == type_name for member in MarkdownElementType
            )