Coverage for src / qdrant_loader / config / chunking.py: 100%

63 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-10 09:40 +0000

1"""Configuration for text chunking.""" 

2 

3from pydantic import BaseModel, Field, ValidationInfo, field_validator 

4 

5 

class DefaultStrategyConfig(BaseModel):
    """Configuration for default text chunking strategy."""

    # Lower bound (characters) for an individual chunk; must be positive.
    min_chunk_size: int = Field(
        100, gt=0, description="Minimum chunk size in characters"
    )
    # Switch for running entity extraction over chunked text.
    enable_entity_extraction: bool = Field(
        True, description="Enable entity extraction from text"
    )

15 

16 

class HtmlStrategyConfig(BaseModel):
    """Configuration for HTML chunking strategy."""

    # Size threshold separating the simple and complex HTML parsing paths.
    simple_parsing_threshold: int = Field(
        100000, gt=0, description="Size threshold for simple vs complex HTML parsing"
    )
    # Upper bound on HTML size accepted for complex parsing (bytes).
    max_html_size_for_parsing: int = Field(
        500000, gt=0, description="Maximum HTML size for complex parsing (bytes)"
    )
    # Cap on how many sections are processed from a single document.
    max_sections_to_process: int = Field(
        200, gt=0, description="Maximum number of sections to process"
    )
    # Upper bound (characters) on chunk size eligible for NLP processing.
    max_chunk_size_for_nlp: int = Field(
        20000, gt=0, description="Maximum chunk size for NLP processing (characters)"
    )
    # Whether chunks retain the document's semantic HTML structure.
    preserve_semantic_structure: bool = Field(
        True, description="Preserve HTML semantic structure in chunks"
    )

41 

42 

class CodeStrategyConfig(BaseModel):
    """Configuration for code chunking strategy."""

    # Upper bound (characters) on file size eligible for AST parsing.
    max_file_size_for_ast: int = Field(
        75000, gt=0, description="Maximum file size for AST parsing (characters)"
    )
    # Cap on how many code elements are processed per file.
    max_elements_to_process: int = Field(
        800, gt=0, description="Maximum number of code elements to process"
    )
    # Depth limit when walking the AST.
    max_recursion_depth: int = Field(
        8, gt=0, description="Maximum AST recursion depth"
    )
    # Upper bound (characters) on a single code element.
    max_element_size: int = Field(
        20000, gt=0, description="Maximum size for individual code elements (characters)"
    )
    # Master switch for AST-based code analysis.
    enable_ast_parsing: bool = Field(
        True, description="Enable AST parsing for code analysis"
    )
    # Switch for dependency analysis of code.
    enable_dependency_analysis: bool = Field(
        True, description="Enable dependency analysis for code"
    )

68 

69 

class JsonStrategyConfig(BaseModel):
    """Configuration for JSON chunking strategy."""

    # Upper bound on JSON document size accepted for parsing (bytes).
    max_json_size_for_parsing: int = Field(
        1000000, gt=0, description="Maximum JSON size for parsing (bytes)"
    )
    # Cap on how many JSON objects are processed per document.
    max_objects_to_process: int = Field(
        200, gt=0, description="Maximum number of JSON objects to process"
    )
    # Upper bound (characters) on chunk size eligible for NLP processing.
    max_chunk_size_for_nlp: int = Field(
        20000, gt=0, description="Maximum chunk size for NLP processing (characters)"
    )
    # Depth limit when descending into nested structures.
    max_recursion_depth: int = Field(
        5, gt=0, description="Maximum recursion depth for nested structures"
    )
    # Cap on array items placed into a single chunk.
    max_array_items_per_chunk: int = Field(
        50, gt=0, description="Maximum array items to include per chunk"
    )
    # Cap on object keys visited per object.
    max_object_keys_to_process: int = Field(
        100, gt=0, description="Maximum object keys to process"
    )
    # Switch for JSON schema inference.
    enable_schema_inference: bool = Field(
        True, description="Enable JSON schema inference"
    )

96 

97 

class MarkdownStrategyConfig(BaseModel):
    """Configuration for Markdown chunking strategy."""

    # --- NLP eligibility thresholds -------------------------------------
    # Minimum content length (characters) before NLP runs.
    min_content_length_for_nlp: int = Field(
        100, gt=0, description="Minimum content length for NLP processing (characters)"
    )
    # Minimum word count before NLP runs.
    min_word_count_for_nlp: int = Field(
        20, gt=0, description="Minimum word count for NLP processing"
    )
    # Minimum line count before NLP runs.
    min_line_count_for_nlp: int = Field(
        3, gt=0, description="Minimum line count for NLP processing"
    )

    # --- Sectioning and chunking limits ---------------------------------
    # Minimum characters for a section to stand on its own.
    min_section_size: int = Field(
        500, gt=0, description="Minimum characters for a standalone section"
    )
    # Safety cap on chunks produced from one section.
    max_chunks_per_section: int = Field(
        1000, gt=0, description="Maximum chunks per section (prevents runaway chunking)"
    )
    # Overlap ratio between adjacent chunks, expressed as a fraction.
    max_overlap_percentage: float = Field(
        0.25,
        ge=0.0,
        le=1.0,
        description="Maximum overlap between chunks as percentage (0.25 = 25%)",
    )

    # --- Processing and estimation --------------------------------------
    # Thread-pool size for parallel processing.
    max_workers: int = Field(
        4, gt=0, description="Maximum worker threads for parallel processing"
    )
    # Extra margin applied when estimating chunk counts, as a fraction.
    estimation_buffer: float = Field(
        0.2,
        ge=0.0,
        le=1.0,
        description="Buffer factor for chunk count estimation (0.2 = 20%)",
    )
    # Reading-speed constant used for reading-time estimates.
    words_per_minute_reading: int = Field(
        200, gt=0, description="Words per minute for reading time estimation"
    )

    # --- Header analysis --------------------------------------------------
    # H1 count threshold feeding the split-level decision.
    header_analysis_threshold_h1: int = Field(
        3, gt=0, description="H1 header count threshold for split level decisions"
    )
    # H3 count threshold feeding the split-level decision.
    header_analysis_threshold_h3: int = Field(
        8, gt=0, description="H3 header count threshold for split level decisions"
    )
    # Switch for extracting hierarchical section metadata.
    enable_hierarchical_metadata: bool = Field(
        True, description="Enable extraction of hierarchical section metadata"
    )

151 

152 

class StrategySpecificConfig(BaseModel):
    """Strategy-specific configuration settings.

    Groups one sub-configuration per chunking strategy. Each field uses a
    ``default_factory`` so an empty config section yields that strategy's
    defaults.
    """

    # Allow the JSON strategy field to be populated either by its attribute
    # name ("json_strategy") or by its alias ("json"). Without this,
    # pydantic v2 accepts ONLY the alias during validation, which makes
    # programmatic construction by field name fail unexpectedly.
    model_config = {"populate_by_name": True}

    default: DefaultStrategyConfig = Field(
        default_factory=DefaultStrategyConfig,
        description="Configuration for default text chunking strategy",
    )
    html: HtmlStrategyConfig = Field(
        default_factory=HtmlStrategyConfig,
        description="Configuration for HTML chunking strategy",
    )
    code: CodeStrategyConfig = Field(
        default_factory=CodeStrategyConfig,
        description="Configuration for code chunking strategy",
    )
    # NOTE(review): aliased to "json" in config files — presumably the
    # attribute is named "json_strategy" to avoid shadowing BaseModel's
    # (deprecated) ``json()`` method; confirm with project history.
    json_strategy: JsonStrategyConfig = Field(
        default_factory=JsonStrategyConfig,
        description="Configuration for JSON chunking strategy",
        alias="json",
    )
    markdown: MarkdownStrategyConfig = Field(
        default_factory=MarkdownStrategyConfig,
        description="Configuration for Markdown chunking strategy",
    )

177 

178 

class ChunkingConfig(BaseModel):
    """Configuration for text chunking."""

    # Target chunk size; chunk_overlap is validated against this below.
    chunk_size: int = Field(
        default=1500,
        description="Size of text chunks in characters",
        gt=0,
        title="Chunk Size",
    )
    chunk_overlap: int = Field(
        default=200,
        description="Overlap between chunks in characters",
        ge=0,
        title="Chunk Overlap",
    )
    # Safety valve against pathological documents producing unbounded chunks.
    max_chunks_per_document: int = Field(
        default=500,
        description="Maximum number of chunks per document (safety limit)",
        gt=0,
        title="Max Chunks Per Document",
    )
    enable_semantic_analysis: bool = Field(
        default=True,
        description="Master switch for semantic analysis (spaCy + LDA) across all chunking strategies. "
        "Disable for faster ingestion when NLP enrichment is not needed.",
    )
    # Depends on enable_semantic_analysis; cross-checked by the validator below.
    enable_enhanced_semantic_analysis: bool = Field(
        default=False,
        description="Enable advanced NLP fields: pos_tags, dependencies, document_similarity. "
        "Requires enable_semantic_analysis=true. "
        "Increases payload size and ingestion time.",
    )

    # Strategy-specific configurations
    strategies: StrategySpecificConfig = Field(
        default_factory=StrategySpecificConfig,
        description="Strategy-specific configuration settings",
    )

    @field_validator("chunk_overlap")
    @classmethod
    def validate_chunk_overlap(cls, v: int, info: ValidationInfo) -> int:
        """Validate that chunk overlap is strictly less than chunk size.

        ``info.data`` only contains fields that already passed validation
        (fields validate in declaration order). If ``chunk_size`` itself
        failed, the cross-field check is skipped rather than compared
        against a hardcoded fallback — the model errors on ``chunk_size``
        anyway.
        """
        chunk_size = info.data.get("chunk_size")
        if chunk_size is not None and v >= chunk_size:
            raise ValueError("Chunk overlap must be less than chunk size")
        return v

    @field_validator("enable_enhanced_semantic_analysis")
    @classmethod
    def validate_enhanced_semantic_analysis_dependency(
        cls, v: bool, info: ValidationInfo
    ) -> bool:
        """Validate enhanced semantic analysis requires base semantic analysis.

        Raises:
            ValueError: if enhanced analysis is enabled while the master
                ``enable_semantic_analysis`` switch is off.
        """
        # Declared earlier than this field, so it is present in info.data
        # unless it failed its own validation; fall back to its default.
        enable_semantic_analysis = info.data.get("enable_semantic_analysis", True)
        if v and enable_semantic_analysis is not True:
            raise ValueError(
                "enable_enhanced_semantic_analysis requires enable_semantic_analysis=True"
            )
        return v