531 lines
15 KiB
Python
531 lines
15 KiB
Python
"""
Unit tests for the intelligent Markdown chunker.

Tests for MarkdownParser, MarkdownChunker, and integration.
"""
|
|
|
|
import pytest
|
|
|
|
from app.services.document.markdown_chunker import (
|
|
MarkdownChunk,
|
|
MarkdownChunker,
|
|
MarkdownElement,
|
|
MarkdownElementType,
|
|
MarkdownParser,
|
|
chunk_markdown,
|
|
)
|
|
|
|
|
|
class TestMarkdownParser:
    """Exercise MarkdownParser.parse on each Markdown construct used below."""

    @staticmethod
    def _of_type(elements, element_type):
        """Return the elements whose ``type`` equals ``element_type``."""
        return [item for item in elements if item.type == element_type]

    def test_parse_headers(self):
        """Headers of levels 1 to 4 come back with their text and level."""
        markdown = (
            "# Main Title\n"
            "\n"
            "## Section 1\n"
            "\n"
            "### Subsection 1.1\n"
            "\n"
            "#### Deep Header\n"
        )
        found = self._of_type(
            MarkdownParser().parse(markdown), MarkdownElementType.HEADER
        )

        expected = [
            ("Main Title", 1),
            ("Section 1", 2),
            ("Subsection 1.1", 3),
            ("Deep Header", 4),
        ]
        assert len(found) == 4
        for header, (title, depth) in zip(found, expected):
            assert header.content == title
            assert header.level == depth

    def test_parse_code_blocks(self):
        """A fenced block with a language tag yields one code element."""
        markdown = (
            "Here is some code:\n"
            "\n"
            "```python\n"
            "def hello():\n"
            'print("Hello, World!")\n'
            "```\n"
            "\n"
            "And some more text.\n"
        )
        found = self._of_type(
            MarkdownParser().parse(markdown), MarkdownElementType.CODE_BLOCK
        )

        assert len(found) == 1
        block = found[0]
        assert block.language == "python"
        assert 'def hello():' in block.content
        assert 'print("Hello, World!")' in block.content

    def test_parse_code_blocks_no_language(self):
        """A fence without a language tag reports an empty language string."""
        markdown = (
            "```\n"
            "plain code here\n"
            "multiple lines\n"
            "```\n"
        )
        found = self._of_type(
            MarkdownParser().parse(markdown), MarkdownElementType.CODE_BLOCK
        )

        assert len(found) == 1
        assert found[0].language == ""
        assert "plain code here" in found[0].content

    def test_parse_tables(self):
        """A pipe table yields one element carrying header and row metadata."""
        markdown = (
            "| Name | Age | City |\n"
            "|------|-----|------|\n"
            "| Alice | 30 | NYC |\n"
            "| Bob | 25 | LA |\n"
        )
        found = self._of_type(
            MarkdownParser().parse(markdown), MarkdownElementType.TABLE
        )

        assert len(found) == 1
        table = found[0]
        assert "Name" in table.content
        assert "Alice" in table.content
        assert table.metadata.get("headers") == ["Name", "Age", "City"]
        assert table.metadata.get("row_count") == 2

    def test_parse_lists(self):
        """Consecutive bullet items are grouped into a single list element."""
        markdown = (
            "- Item 1\n"
            "- Item 2\n"
            "- Item 3\n"
        )
        found = self._of_type(
            MarkdownParser().parse(markdown), MarkdownElementType.LIST
        )

        assert len(found) == 1
        for label in ("Item 1", "Item 2", "Item 3"):
            assert label in found[0].content

    def test_parse_ordered_lists(self):
        """Consecutive numbered items are grouped into a single list element."""
        markdown = (
            "1. First\n"
            "2. Second\n"
            "3. Third\n"
        )
        found = self._of_type(
            MarkdownParser().parse(markdown), MarkdownElementType.LIST
        )

        assert len(found) == 1
        for label in ("First", "Second", "Third"):
            assert label in found[0].content

    def test_parse_blockquotes(self):
        """Adjacent ``>`` lines are grouped into a single blockquote element."""
        markdown = (
            "> This is a quote.\n"
            "> It spans multiple lines.\n"
            "> And continues here.\n"
        )
        found = self._of_type(
            MarkdownParser().parse(markdown), MarkdownElementType.BLOCKQUOTE
        )

        assert len(found) == 1
        assert "This is a quote." in found[0].content
        assert "It spans multiple lines." in found[0].content

    def test_parse_paragraphs(self):
        """Blank-line separated text yields one paragraph element per group."""
        markdown = (
            "This is the first paragraph.\n"
            "\n"
            "This is the second paragraph.\n"
            "It has multiple lines.\n"
            "\n"
            "This is the third.\n"
        )
        found = self._of_type(
            MarkdownParser().parse(markdown), MarkdownElementType.PARAGRAPH
        )

        assert len(found) == 3
        assert "first paragraph" in found[0].content
        assert "second paragraph" in found[1].content

    def test_parse_mixed_content(self):
        """A document mixing every construct yields the expected counts."""
        markdown = (
            "# Documentation\n"
            "\n"
            "## Introduction\n"
            "\n"
            "This is an introduction paragraph.\n"
            "\n"
            "## Code Example\n"
            "\n"
            "```python\n"
            "def example():\n"
            "return 42\n"
            "```\n"
            "\n"
            "## Data Table\n"
            "\n"
            "| Column A | Column B |\n"
            "|----------|----------|\n"
            "| Value 1 | Value 2 |\n"
            "\n"
            "## List\n"
            "\n"
            "- Item A\n"
            "- Item B\n"
            "\n"
            "> Note: This is important.\n"
        )
        elements = MarkdownParser().parse(markdown)

        exact_counts = [
            (MarkdownElementType.HEADER, 5),
            (MarkdownElementType.CODE_BLOCK, 1),
            (MarkdownElementType.TABLE, 1),
            (MarkdownElementType.LIST, 1),
            (MarkdownElementType.BLOCKQUOTE, 1),
        ]
        for element_type, expected in exact_counts:
            assert len(self._of_type(elements, element_type)) == expected
        # Paragraphs only have a lower bound; the exact number is not pinned.
        assert len(self._of_type(elements, MarkdownElementType.PARAGRAPH)) >= 1

    def test_code_blocks_not_parsed_as_other_elements(self):
        """Header, list and table syntax inside a fence stays part of the code."""
        markdown = (
            "```markdown\n"
            "# This is not a header\n"
            "- This is not a list\n"
            "| This is not a table |\n"
            "```\n"
        )
        elements = MarkdownParser().parse(markdown)

        expected_counts = [
            (MarkdownElementType.HEADER, 0),
            (MarkdownElementType.LIST, 0),
            (MarkdownElementType.TABLE, 0),
            (MarkdownElementType.CODE_BLOCK, 1),
        ]
        for element_type, expected in expected_counts:
            assert len(self._of_type(elements, element_type)) == expected
|
|
|
|
|
|
class TestMarkdownChunker:
    """Exercise MarkdownChunker.chunk on small and oversized inputs."""

    @staticmethod
    def _of_type(chunks, element_type):
        """Return the chunks whose ``element_type`` equals ``element_type``."""
        return [piece for piece in chunks if piece.element_type == element_type]

    def test_chunk_simple_document(self):
        """Two sections give at least two chunks, all with prefixed ids."""
        markdown = (
            "# Title\n"
            "\n"
            "This is a paragraph.\n"
            "\n"
            "## Section\n"
            "\n"
            "Another paragraph.\n"
        )
        pieces = MarkdownChunker().chunk(markdown, "test_doc")

        assert len(pieces) >= 2
        assert all(isinstance(piece, MarkdownChunk) for piece in pieces)
        assert all(piece.chunk_id.startswith("test_doc") for piece in pieces)

    def test_chunk_preserves_header_context(self):
        """Chunks below a nested header carry the ancestor header titles."""
        markdown = (
            "# Main Title\n"
            "\n"
            "## Section A\n"
            "\n"
            "Content under section A.\n"
            "\n"
            "### Subsection A1\n"
            "\n"
            "Content under subsection A1.\n"
        )
        pieces = MarkdownChunker(include_header_context=True).chunk(markdown, "test")

        # Only chunks that mention "subsection a1" case-insensitively but do
        # not contain the exact header text "Subsection A1" are inspected.
        # NOTE(review): if no chunk satisfies both conditions, nothing is
        # asserted and the test passes vacuously.
        for piece in pieces:
            body = piece.content
            if "Subsection A1" in body or "subsection a1" not in body.lower():
                continue
            assert "Main Title" in piece.header_context
            assert "Section A" in piece.header_context

    def test_chunk_code_blocks_preserved(self):
        """A small fenced block stays in one chunk and keeps its language."""
        markdown = (
            "```python\n"
            "def function_one():\n"
            "pass\n"
            "\n"
            "def function_two():\n"
            "pass\n"
            "```\n"
        )
        chunker = MarkdownChunker(max_chunk_size=2000, preserve_code_blocks=True)
        code_pieces = self._of_type(
            chunker.chunk(markdown, "test"), MarkdownElementType.CODE_BLOCK
        )

        assert len(code_pieces) == 1
        only = code_pieces[0]
        assert "def function_one" in only.content
        assert "def function_two" in only.content
        assert only.language == "python"

    def test_chunk_large_code_block_split(self):
        """A fenced block above the size limit is split into fenced chunks."""
        source = "\n".join(f"def function_{index}(): pass" for index in range(100))
        markdown = "```python\n" + source + "\n```"

        chunker = MarkdownChunker(max_chunk_size=500, preserve_code_blocks=True)
        code_pieces = self._of_type(
            chunker.chunk(markdown, "test"), MarkdownElementType.CODE_BLOCK
        )

        assert len(code_pieces) > 1
        for piece in code_pieces:
            assert piece.language == "python"
            assert "```python" in piece.content
            assert "```" in piece.content

    def test_chunk_table_preserved(self):
        """A small table stays in one chunk with all of its rows."""
        markdown = (
            "| Name | Age |\n"
            "|------|-----|\n"
            "| Alice | 30 |\n"
            "| Bob | 25 |\n"
        )
        chunker = MarkdownChunker(max_chunk_size=2000, preserve_tables=True)
        table_pieces = self._of_type(
            chunker.chunk(markdown, "test"), MarkdownElementType.TABLE
        )

        assert len(table_pieces) == 1
        assert "Alice" in table_pieces[0].content
        assert "Bob" in table_pieces[0].content

    def test_chunk_large_table_split(self):
        """A table above the size limit is split and each part keeps the header."""
        heading = "| Name | Age |\n|------|-----|\n"
        body_rows = "\n".join(f"| Name{index} | {index * 10} |" for index in range(50))
        markdown = heading + body_rows

        chunker = MarkdownChunker(max_chunk_size=200, preserve_tables=True)
        table_pieces = self._of_type(
            chunker.chunk(markdown, "test"), MarkdownElementType.TABLE
        )

        assert len(table_pieces) > 1
        for piece in table_pieces:
            assert "| Name | Age |" in piece.content
            assert "|------|-----|" in piece.content

    def test_chunk_list_preserved(self):
        """A small list stays in one chunk from its first to its last item."""
        markdown = (
            "- Item 1\n"
            "- Item 2\n"
            "- Item 3\n"
            "- Item 4\n"
            "- Item 5\n"
        )
        chunker = MarkdownChunker(max_chunk_size=2000, preserve_lists=True)
        list_pieces = self._of_type(
            chunker.chunk(markdown, "test"), MarkdownElementType.LIST
        )

        assert len(list_pieces) == 1
        assert "Item 1" in list_pieces[0].content
        assert "Item 5" in list_pieces[0].content

    def test_chunk_empty_document(self):
        """An empty string produces no chunks."""
        pieces = MarkdownChunker().chunk("", "test")

        assert len(pieces) == 0

    def test_chunk_only_headers(self):
        """A document made only of headers produces no chunks."""
        markdown = (
            "# Title 1\n"
            "## Title 2\n"
            "### Title 3\n"
        )
        pieces = MarkdownChunker().chunk(markdown, "test")

        assert len(pieces) == 0
|
|
|
|
|
|
class TestChunkMarkdownFunction:
    """Exercise the module-level ``chunk_markdown`` convenience function."""

    def test_basic_chunking(self):
        """Each returned chunk exposes the four expected keys."""
        markdown = (
            "# Title\n"
            "\n"
            "Content paragraph.\n"
            "\n"
            "```python\n"
            'code = "here"\n'
            "```\n"
        )
        results = chunk_markdown(markdown, "doc1")

        assert len(results) >= 1
        for key in ("chunk_id", "content", "element_type", "header_context"):
            assert all(key in entry for entry in results)

    def test_custom_parameters(self):
        """Every tuning keyword is accepted and at least one chunk comes back."""
        options = {
            "max_chunk_size": 500,
            "min_chunk_size": 50,
            "preserve_code_blocks": False,
            "preserve_tables": False,
            "preserve_lists": False,
            "include_header_context": False,
        }
        results = chunk_markdown("A" * 2000, "doc1", **options)

        assert len(results) >= 1
|
|
|
|
|
|
class TestMarkdownElement:
    """Check ``MarkdownElement.to_dict`` for header, code and table elements."""

    def test_to_dict(self):
        """A header element serializes its type, text, level and line span."""
        header = MarkdownElement(
            type=MarkdownElementType.HEADER,
            content="Test Header",
            level=2,
            line_start=10,
            line_end=10,
            metadata={"level": 2},
        )
        serialized = header.to_dict()

        expected_fields = {
            "type": "header",
            "content": "Test Header",
            "level": 2,
            "line_start": 10,
            "line_end": 10,
        }
        for key, value in expected_fields.items():
            assert serialized[key] == value

    def test_code_block_with_language(self):
        """A code element serializes its type and language."""
        code = MarkdownElement(
            type=MarkdownElementType.CODE_BLOCK,
            content="print('hello')",
            language="python",
            line_start=5,
            line_end=7,
        )
        serialized = code.to_dict()

        assert serialized["type"] == "code_block"
        assert serialized["language"] == "python"

    def test_table_with_metadata(self):
        """A table element serializes its type and passes metadata through."""
        table = MarkdownElement(
            type=MarkdownElementType.TABLE,
            content="| A | B |\n|---|---|\n| 1 | 2 |",
            line_start=1,
            line_end=3,
            metadata={"headers": ["A", "B"], "row_count": 1},
        )
        serialized = table.to_dict()

        assert serialized["type"] == "table"
        stored = serialized["metadata"]
        assert stored["headers"] == ["A", "B"]
        assert serialized["metadata"]["row_count"] == 1
|
|
|
|
|
|
class TestMarkdownChunk:
    """Check ``MarkdownChunk.to_dict`` for paragraph and code chunks."""

    def test_to_dict(self):
        """A paragraph chunk serializes id, text, type, context and metadata."""
        paragraph = MarkdownChunk(
            chunk_id="doc_chunk_0",
            content="Test content",
            element_type=MarkdownElementType.PARAGRAPH,
            header_context=["Main Title", "Section"],
            metadata={"key": "value"},
        )
        serialized = paragraph.to_dict()

        expected_fields = {
            "chunk_id": "doc_chunk_0",
            "content": "Test content",
            "element_type": "paragraph",
            "header_context": ["Main Title", "Section"],
        }
        for key, value in expected_fields.items():
            assert serialized[key] == value
        assert serialized["metadata"]["key"] == "value"

    def test_with_language(self):
        """A code chunk keeps its language in the serialized form."""
        code = MarkdownChunk(
            chunk_id="code_0",
            content="```python\nprint('hi')\n```",
            element_type=MarkdownElementType.CODE_BLOCK,
            header_context=[],
            language="python",
        )
        serialized = code.to_dict()

        assert serialized["language"] == "python"
|
|
|
|
|
|
class TestMarkdownElementType:
    """Tests for the MarkdownElementType enum."""

    def test_all_types_exist(self):
        """Check that every expected element type is defined on the enum.

        A type counts as present when the enum has an attribute named after
        the upper-cased type name, or a member whose value equals the type
        name. All names are checked before asserting, so a failure lists
        every missing type instead of stopping at the first one.
        """
        expected_types = [
            "header",
            "paragraph",
            "code_block",
            "inline_code",
            "table",
            "list",
            "blockquote",
            "horizontal_rule",
            "image",
            "link",
            "text",
        ]

        # Collected once; a list (not a set) so values need not be hashable.
        known_values = [member.value for member in MarkdownElementType]

        missing = [
            type_name
            for type_name in expected_types
            if not hasattr(MarkdownElementType, type_name.upper())
            and type_name not in known_values
        ]

        assert not missing, f"MarkdownElementType is missing types: {missing}"
|