Coverage for src / qdrant_loader / config / chunking.py: 100%
63 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-10 09:40 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-10 09:40 +0000
"""Configuration for text chunking."""

from pydantic import BaseModel, Field, ValidationInfo, field_validator
class DefaultStrategyConfig(BaseModel):
    """Configuration for default text chunking strategy."""

    # Chunks shorter than this many characters are not considered valid.
    min_chunk_size: int = Field(
        default=100, gt=0, description="Minimum chunk size in characters"
    )
    # Toggles NLP entity extraction over the chunked text.
    enable_entity_extraction: bool = Field(
        default=True, description="Enable entity extraction from text"
    )
class HtmlStrategyConfig(BaseModel):
    """Configuration for HTML chunking strategy."""

    # Documents at or above this size switch from complex to simple parsing.
    simple_parsing_threshold: int = Field(
        default=100000,
        gt=0,
        description="Size threshold for simple vs complex HTML parsing",
    )
    # Hard ceiling on input size for the complex (structure-aware) parser.
    max_html_size_for_parsing: int = Field(
        default=500000,
        gt=0,
        description="Maximum HTML size for complex parsing (bytes)",
    )
    # Safety cap on how many document sections are chunked.
    max_sections_to_process: int = Field(
        default=200, gt=0, description="Maximum number of sections to process"
    )
    # Chunks larger than this skip NLP enrichment.
    max_chunk_size_for_nlp: int = Field(
        default=20000,
        gt=0,
        description="Maximum chunk size for NLP processing (characters)",
    )
    # Keep headings/sections intact rather than splitting on raw size alone.
    preserve_semantic_structure: bool = Field(
        default=True, description="Preserve HTML semantic structure in chunks"
    )
class CodeStrategyConfig(BaseModel):
    """Configuration for code chunking strategy."""

    # Files larger than this skip AST parsing entirely.
    max_file_size_for_ast: int = Field(
        default=75000,
        gt=0,
        description="Maximum file size for AST parsing (characters)",
    )
    # Safety cap on how many code elements (functions, classes, ...) are processed.
    max_elements_to_process: int = Field(
        default=800, gt=0, description="Maximum number of code elements to process"
    )
    # Bounds recursion when walking nested AST nodes.
    max_recursion_depth: int = Field(
        default=8, gt=0, description="Maximum AST recursion depth"
    )
    # Individual elements larger than this are truncated or skipped downstream.
    max_element_size: int = Field(
        default=20000,
        gt=0,
        description="Maximum size for individual code elements (characters)",
    )
    # Master switch for structural (AST-based) code analysis.
    enable_ast_parsing: bool = Field(
        default=True, description="Enable AST parsing for code analysis"
    )
    # Toggles analysis of import/usage relationships between elements.
    enable_dependency_analysis: bool = Field(
        default=True, description="Enable dependency analysis for code"
    )
class JsonStrategyConfig(BaseModel):
    """Configuration for JSON chunking strategy."""

    # Documents above this byte size are not parsed at all.
    max_json_size_for_parsing: int = Field(
        default=1000000, gt=0, description="Maximum JSON size for parsing (bytes)"
    )
    # Safety cap on the number of top-level objects handled.
    max_objects_to_process: int = Field(
        default=200, gt=0, description="Maximum number of JSON objects to process"
    )
    # Chunks larger than this skip NLP enrichment.
    max_chunk_size_for_nlp: int = Field(
        default=20000,
        gt=0,
        description="Maximum chunk size for NLP processing (characters)",
    )
    # Bounds descent into nested objects/arrays.
    max_recursion_depth: int = Field(
        default=5, gt=0, description="Maximum recursion depth for nested structures"
    )
    # Large arrays are split so each chunk carries at most this many items.
    max_array_items_per_chunk: int = Field(
        default=50, gt=0, description="Maximum array items to include per chunk"
    )
    # Objects with more keys than this are truncated during processing.
    max_object_keys_to_process: int = Field(
        default=100, gt=0, description="Maximum object keys to process"
    )
    # Toggles inference of a schema from the JSON structure.
    enable_schema_inference: bool = Field(
        default=True, description="Enable JSON schema inference"
    )
class MarkdownStrategyConfig(BaseModel):
    """Configuration for Markdown chunking strategy."""

    # --- NLP gating: content must clear all three thresholds ---
    min_content_length_for_nlp: int = Field(
        default=100,
        gt=0,
        description="Minimum content length for NLP processing (characters)",
    )
    min_word_count_for_nlp: int = Field(
        default=20, gt=0, description="Minimum word count for NLP processing"
    )
    min_line_count_for_nlp: int = Field(
        default=3, gt=0, description="Minimum line count for NLP processing"
    )

    # --- Section/chunk sizing ---
    min_section_size: int = Field(
        default=500, gt=0, description="Minimum characters for a standalone section"
    )
    max_chunks_per_section: int = Field(
        default=1000,
        gt=0,
        description="Maximum chunks per section (prevents runaway chunking)",
    )
    # Fractional overlap between adjacent chunks (0.0-1.0).
    max_overlap_percentage: float = Field(
        default=0.25,
        ge=0.0,
        le=1.0,
        description="Maximum overlap between chunks as percentage (0.25 = 25%)",
    )

    # --- Processing/estimation tuning ---
    max_workers: int = Field(
        default=4, gt=0, description="Maximum worker threads for parallel processing"
    )
    # Fractional slack added when estimating chunk counts (0.0-1.0).
    estimation_buffer: float = Field(
        default=0.2,
        ge=0.0,
        le=1.0,
        description="Buffer factor for chunk count estimation (0.2 = 20%)",
    )
    words_per_minute_reading: int = Field(
        default=200, gt=0, description="Words per minute for reading time estimation"
    )

    # --- Header analysis: thresholds steering the heading level to split on ---
    header_analysis_threshold_h1: int = Field(
        default=3,
        gt=0,
        description="H1 header count threshold for split level decisions",
    )
    header_analysis_threshold_h3: int = Field(
        default=8,
        gt=0,
        description="H3 header count threshold for split level decisions",
    )
    # Toggles capture of section-hierarchy metadata on each chunk.
    enable_hierarchical_metadata: bool = Field(
        default=True, description="Enable extraction of hierarchical section metadata"
    )
class StrategySpecificConfig(BaseModel):
    """Strategy-specific configuration settings.

    One sub-model per chunking strategy; every field has a default factory,
    so an empty input produces a fully defaulted configuration.
    """

    # `json_strategy` is exposed under the alias "json" (the field cannot be
    # named `json` without shadowing `BaseModel.json`). Without
    # `populate_by_name`, pydantic v2 would ONLY accept the alias key on
    # input; enabling it additionally accepts the field name, so both
    # `{"json": ...}` (config files) and `StrategySpecificConfig(json_strategy=...)`
    # (programmatic construction) validate. Backward-compatible: alias input
    # still works unchanged.
    model_config = {"populate_by_name": True}

    default: DefaultStrategyConfig = Field(
        default_factory=DefaultStrategyConfig,
        description="Configuration for default text chunking strategy",
    )
    html: HtmlStrategyConfig = Field(
        default_factory=HtmlStrategyConfig,
        description="Configuration for HTML chunking strategy",
    )
    code: CodeStrategyConfig = Field(
        default_factory=CodeStrategyConfig,
        description="Configuration for code chunking strategy",
    )
    json_strategy: JsonStrategyConfig = Field(
        default_factory=JsonStrategyConfig,
        description="Configuration for JSON chunking strategy",
        alias="json",
    )
    markdown: MarkdownStrategyConfig = Field(
        default_factory=MarkdownStrategyConfig,
        description="Configuration for Markdown chunking strategy",
    )
class ChunkingConfig(BaseModel):
    """Configuration for text chunking."""

    chunk_size: int = Field(
        default=1500,
        description="Size of text chunks in characters",
        gt=0,
        title="Chunk Size",
    )
    chunk_overlap: int = Field(
        default=200,
        description="Overlap between chunks in characters",
        ge=0,
        title="Chunk Overlap",
    )
    max_chunks_per_document: int = Field(
        default=500,
        description="Maximum number of chunks per document (safety limit)",
        gt=0,
        title="Max Chunks Per Document",
    )
    enable_semantic_analysis: bool = Field(
        default=True,
        description="Master switch for semantic analysis (spaCy + LDA) across all chunking strategies. "
        "Disable for faster ingestion when NLP enrichment is not needed.",
    )
    enable_enhanced_semantic_analysis: bool = Field(
        default=False,
        description="Enable advanced NLP fields: pos_tags, dependencies, document_similarity. "
        "Requires enable_semantic_analysis=true. "
        "Increases payload size and ingestion time.",
    )

    # Strategy-specific configurations
    strategies: StrategySpecificConfig = Field(
        default_factory=StrategySpecificConfig,
        description="Strategy-specific configuration settings",
    )

    @field_validator("chunk_overlap")
    @classmethod  # pydantic v2 passes the class; decorate explicitly per docs
    def validate_chunk_overlap(cls, v: int, info: ValidationInfo) -> int:
        """Validate that chunk overlap is strictly less than chunk size.

        Raises:
            ValueError: If ``chunk_overlap`` >= ``chunk_size``.
        """
        # ``chunk_size`` is absent from info.data only if its own validation
        # failed; fall back to the field's declared default rather than a
        # duplicated magic number so the two values can never drift apart.
        chunk_size = info.data.get(
            "chunk_size", cls.model_fields["chunk_size"].default
        )
        if v >= chunk_size:
            raise ValueError("Chunk overlap must be less than chunk size")
        return v

    @field_validator("enable_enhanced_semantic_analysis")
    @classmethod  # pydantic v2 passes the class; decorate explicitly per docs
    def validate_enhanced_semantic_analysis_dependency(
        cls, v: bool, info: ValidationInfo
    ) -> bool:
        """Validate enhanced semantic analysis requires base semantic analysis.

        Raises:
            ValueError: If enhanced analysis is enabled while the base
                semantic analysis switch is off.
        """
        # Enhanced fields are produced by the base pipeline; enabling them
        # alone would silently yield nothing, so fail fast at config time.
        enable_semantic_analysis = info.data.get("enable_semantic_analysis", True)
        if v and enable_semantic_analysis is not True:
            raise ValueError(
                "enable_enhanced_semantic_analysis requires enable_semantic_analysis=True"
            )
        return v