Coverage for src/qdrant_loader/config/chunking.py: 100%
56 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
1"""Configuration for text chunking."""
3from pydantic import BaseModel, Field, ValidationInfo, field_validator
class DefaultStrategyConfig(BaseModel):
    """Settings for the default text chunking strategy.

    The size limit must be strictly positive; both feature toggles are
    switched on unless overridden.
    """

    # --- Size limit (characters) ---
    min_chunk_size: int = Field(
        gt=0,
        default=100,
        description="Minimum chunk size in characters",
    )

    # --- Feature toggles ---
    enable_semantic_analysis: bool = Field(
        description="Enable semantic analysis for text chunks",
        default=True,
    )
    enable_entity_extraction: bool = Field(
        description="Enable entity extraction from text",
        default=True,
    )
class HtmlStrategyConfig(BaseModel):
    """Settings for the HTML chunking strategy.

    All numeric limits must be strictly positive.
    """

    # --- Parsing size limits ---
    simple_parsing_threshold: int = Field(
        gt=0,
        default=100000,
        description="Size threshold for simple vs complex HTML parsing",
    )
    max_html_size_for_parsing: int = Field(
        gt=0,
        default=500000,
        description="Maximum HTML size for complex parsing (bytes)",
    )

    # --- Processing caps ---
    max_sections_to_process: int = Field(
        gt=0,
        default=200,
        description="Maximum number of sections to process",
    )
    max_chunk_size_for_nlp: int = Field(
        gt=0,
        default=20000,
        description="Maximum chunk size for NLP processing (characters)",
    )

    # --- Feature toggle ---
    preserve_semantic_structure: bool = Field(
        description="Preserve HTML semantic structure in chunks",
        default=True,
    )
class CodeStrategyConfig(BaseModel):
    """Settings for the source-code chunking strategy.

    All numeric limits must be strictly positive; both analysis toggles
    default to enabled.
    """

    # --- AST / element limits ---
    max_file_size_for_ast: int = Field(
        gt=0,
        default=75000,
        description="Maximum file size for AST parsing (characters)",
    )
    max_elements_to_process: int = Field(
        gt=0,
        default=800,
        description="Maximum number of code elements to process",
    )
    max_recursion_depth: int = Field(
        gt=0,
        default=8,
        description="Maximum AST recursion depth",
    )
    max_element_size: int = Field(
        gt=0,
        default=20000,
        description="Maximum size for individual code elements (characters)",
    )

    # --- Feature toggles ---
    enable_ast_parsing: bool = Field(
        description="Enable AST parsing for code analysis",
        default=True,
    )
    enable_dependency_analysis: bool = Field(
        description="Enable dependency analysis for code",
        default=True,
    )
class JsonStrategyConfig(BaseModel):
    """Settings for the JSON chunking strategy.

    All numeric limits must be strictly positive; schema inference is
    enabled by default.
    """

    # --- Document-level limits ---
    max_json_size_for_parsing: int = Field(
        gt=0,
        default=1000000,
        description="Maximum JSON size for parsing (bytes)",
    )
    max_objects_to_process: int = Field(
        gt=0,
        default=200,
        description="Maximum number of JSON objects to process",
    )
    max_chunk_size_for_nlp: int = Field(
        gt=0,
        default=20000,
        description="Maximum chunk size for NLP processing (characters)",
    )

    # --- Structural limits ---
    max_recursion_depth: int = Field(
        gt=0,
        default=5,
        description="Maximum recursion depth for nested structures",
    )
    max_array_items_per_chunk: int = Field(
        gt=0,
        default=50,
        description="Maximum array items to include per chunk",
    )
    max_object_keys_to_process: int = Field(
        gt=0,
        default=100,
        description="Maximum object keys to process",
    )

    # --- Feature toggle ---
    enable_schema_inference: bool = Field(
        description="Enable JSON schema inference",
        default=True,
    )
class MarkdownStrategyConfig(BaseModel):
    """Settings for the Markdown chunking strategy.

    Integer settings must be strictly positive; the two fractional
    settings are restricted to the closed interval [0.0, 1.0].
    """

    # --- Minimum content required before NLP processing ---
    min_content_length_for_nlp: int = Field(
        gt=0,
        default=100,
        description="Minimum content length for NLP processing (characters)",
    )
    min_word_count_for_nlp: int = Field(
        gt=0,
        default=20,
        description="Minimum word count for NLP processing",
    )
    min_line_count_for_nlp: int = Field(
        gt=0,
        default=3,
        description="Minimum line count for NLP processing",
    )

    # --- Section sizing and chunk limits ---
    min_section_size: int = Field(
        gt=0,
        default=500,
        description="Minimum characters for a standalone section",
    )
    max_chunks_per_section: int = Field(
        gt=0,
        default=1000,
        description="Maximum chunks per section (prevents runaway chunking)",
    )
    max_overlap_percentage: float = Field(
        ge=0.0,
        le=1.0,
        default=0.25,
        description="Maximum overlap between chunks as percentage (0.25 = 25%)",
    )

    # --- Parallelism and estimation ---
    max_workers: int = Field(
        gt=0,
        default=4,
        description="Maximum worker threads for parallel processing",
    )
    estimation_buffer: float = Field(
        ge=0.0,
        le=1.0,
        default=0.2,
        description="Buffer factor for chunk count estimation (0.2 = 20%)",
    )
    words_per_minute_reading: int = Field(
        gt=0,
        default=200,
        description="Words per minute for reading time estimation",
    )

    # --- Header analysis thresholds ---
    header_analysis_threshold_h1: int = Field(
        gt=0,
        default=3,
        description="H1 header count threshold for split level decisions",
    )
    header_analysis_threshold_h3: int = Field(
        gt=0,
        default=8,
        description="H3 header count threshold for split level decisions",
    )

    # --- Feature toggle ---
    enable_hierarchical_metadata: bool = Field(
        description="Enable extraction of hierarchical section metadata",
        default=True,
    )
class StrategySpecificConfig(BaseModel):
    """Strategy-specific configuration settings.

    Groups one sub-configuration per chunking strategy. Each field falls
    back to a freshly constructed default sub-config when omitted. The JSON
    settings are stored on the ``json_strategy`` attribute and exposed
    under the alias ``json``.
    """

    # Accept the attribute name in addition to the alias on input. Without
    # this, data keyed by ``json_strategy`` (for example the output of
    # ``model_dump()``) is treated as an unknown key and the JSON settings
    # silently fall back to their defaults. The ``json`` alias keeps working.
    model_config = {"populate_by_name": True}

    default: DefaultStrategyConfig = Field(
        default_factory=DefaultStrategyConfig,
        description="Configuration for default text chunking strategy",
    )
    html: HtmlStrategyConfig = Field(
        default_factory=HtmlStrategyConfig,
        description="Configuration for HTML chunking strategy",
    )
    code: CodeStrategyConfig = Field(
        default_factory=CodeStrategyConfig,
        description="Configuration for code chunking strategy",
    )
    json_strategy: JsonStrategyConfig = Field(
        default_factory=JsonStrategyConfig,
        description="Configuration for JSON chunking strategy",
        alias="json",
    )
    markdown: MarkdownStrategyConfig = Field(
        default_factory=MarkdownStrategyConfig,
        description="Configuration for Markdown chunking strategy",
    )
class ChunkingConfig(BaseModel):
    """Configuration for text chunking.

    Holds the global chunk size, chunk overlap and per-document chunk limit,
    plus the strategy-specific settings. An explicitly supplied
    ``chunk_overlap`` must be strictly smaller than ``chunk_size``.
    """

    chunk_size: int = Field(
        default=1500,
        description="Size of text chunks in characters",
        gt=0,
        title="Chunk Size",
    )
    chunk_overlap: int = Field(
        default=200,
        description="Overlap between chunks in characters",
        ge=0,
        title="Chunk Overlap",
    )
    max_chunks_per_document: int = Field(
        default=500,
        description="Maximum number of chunks per document (safety limit)",
        gt=0,
        title="Max Chunks Per Document",
    )

    # Strategy-specific configurations
    strategies: StrategySpecificConfig = Field(
        default_factory=StrategySpecificConfig,
        description="Strategy-specific configuration settings",
    )

    # NOTE(review): pydantic skips field validators for default values unless
    # ``validate_default`` is enabled, so the check below presumably does not
    # run when ``chunk_overlap`` is omitted (e.g. ``chunk_size=100`` with the
    # default overlap of 200) - confirm whether that case should be rejected.
    @field_validator("chunk_overlap")
    @classmethod
    def validate_chunk_overlap(cls, v: int, info: ValidationInfo) -> int:
        """Validate that chunk overlap is less than chunk size.

        Args:
            v: Candidate ``chunk_overlap`` value.
            info: Validation context; ``info.data`` holds the fields that
                were validated before this one.

        Returns:
            The overlap value, unchanged.

        Raises:
            ValueError: If the overlap is not strictly smaller than the
                chunk size.
        """
        chunk_size = info.data.get("chunk_size")
        if chunk_size is None:
            # ``chunk_size`` is declared first, so it is only absent here when
            # it failed its own validation. That error is already reported;
            # comparing against a stand-in value would add a misleading one.
            return v
        if v >= chunk_size:
            raise ValueError("Chunk overlap must be less than chunk size")
        return v