Coverage for src/qdrant_loader/config/chunking.py: 100%

56 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1"""Configuration for text chunking.""" 

2 

3from pydantic import BaseModel, Field, ValidationInfo, field_validator 

4 

5 

class DefaultStrategyConfig(BaseModel):
    """Configuration for default text chunking strategy."""

    # Lower bound on chunk length, measured in characters; must be > 0.
    min_chunk_size: int = Field(
        description="Minimum chunk size in characters",
        default=100,
        gt=0,
    )

    # Feature switches; both default to enabled.
    enable_semantic_analysis: bool = Field(
        description="Enable semantic analysis for text chunks",
        default=True,
    )
    enable_entity_extraction: bool = Field(
        description="Enable entity extraction from text",
        default=True,
    )

18 

19 

class HtmlStrategyConfig(BaseModel):
    """Configuration for HTML chunking strategy."""

    # --- Size limits (every value must be strictly positive) ---

    # Threshold that selects between simple and complex HTML parsing.
    simple_parsing_threshold: int = Field(
        description="Size threshold for simple vs complex HTML parsing",
        default=100_000,
        gt=0,
    )
    # Upper bound, in bytes, on HTML handed to the complex parser.
    max_html_size_for_parsing: int = Field(
        description="Maximum HTML size for complex parsing (bytes)",
        default=500_000,
        gt=0,
    )
    # Cap on how many sections are processed.
    max_sections_to_process: int = Field(
        description="Maximum number of sections to process",
        default=200,
        gt=0,
    )
    # Upper bound, in characters, on a chunk sent to NLP processing.
    max_chunk_size_for_nlp: int = Field(
        description="Maximum chunk size for NLP processing (characters)",
        default=20_000,
        gt=0,
    )

    # --- Feature switches ---

    preserve_semantic_structure: bool = Field(
        description="Preserve HTML semantic structure in chunks",
        default=True,
    )

44 

45 

class CodeStrategyConfig(BaseModel):
    """Configuration for code chunking strategy."""

    # --- Limits (every value must be strictly positive) ---

    # Upper bound, in characters, on a file that is parsed into an AST.
    max_file_size_for_ast: int = Field(
        description="Maximum file size for AST parsing (characters)",
        default=75_000,
        gt=0,
    )
    # Cap on the number of code elements processed.
    max_elements_to_process: int = Field(
        description="Maximum number of code elements to process",
        default=800,
        gt=0,
    )
    # Depth limit for AST recursion.
    max_recursion_depth: int = Field(
        description="Maximum AST recursion depth",
        default=8,
        gt=0,
    )
    # Upper bound, in characters, on a single code element.
    max_element_size: int = Field(
        description="Maximum size for individual code elements (characters)",
        default=20_000,
        gt=0,
    )

    # --- Feature switches (both default to enabled) ---

    enable_ast_parsing: bool = Field(
        description="Enable AST parsing for code analysis",
        default=True,
    )
    enable_dependency_analysis: bool = Field(
        description="Enable dependency analysis for code",
        default=True,
    )

71 

72 

class JsonStrategyConfig(BaseModel):
    """Configuration for JSON chunking strategy."""

    # --- Limits (every value must be strictly positive) ---

    # Upper bound, in bytes, on JSON input that is parsed.
    max_json_size_for_parsing: int = Field(
        description="Maximum JSON size for parsing (bytes)",
        default=1_000_000,
        gt=0,
    )
    # Cap on the number of JSON objects processed.
    max_objects_to_process: int = Field(
        description="Maximum number of JSON objects to process",
        default=200,
        gt=0,
    )
    # Upper bound, in characters, on a chunk sent to NLP processing.
    max_chunk_size_for_nlp: int = Field(
        description="Maximum chunk size for NLP processing (characters)",
        default=20_000,
        gt=0,
    )
    # Depth limit when descending into nested structures.
    max_recursion_depth: int = Field(
        description="Maximum recursion depth for nested structures",
        default=5,
        gt=0,
    )
    # Cap on array items placed in one chunk.
    max_array_items_per_chunk: int = Field(
        description="Maximum array items to include per chunk",
        default=50,
        gt=0,
    )
    # Cap on the number of object keys processed.
    max_object_keys_to_process: int = Field(
        description="Maximum object keys to process",
        default=100,
        gt=0,
    )

    # --- Feature switches ---

    enable_schema_inference: bool = Field(
        description="Enable JSON schema inference",
        default=True,
    )

99 

100 

class MarkdownStrategyConfig(BaseModel):
    """Configuration for Markdown chunking strategy."""

    # --- Minimums that gate NLP processing (all must be > 0) ---

    min_content_length_for_nlp: int = Field(
        description="Minimum content length for NLP processing (characters)",
        default=100,
        gt=0,
    )
    min_word_count_for_nlp: int = Field(
        description="Minimum word count for NLP processing",
        default=20,
        gt=0,
    )
    min_line_count_for_nlp: int = Field(
        description="Minimum line count for NLP processing",
        default=3,
        gt=0,
    )

    # --- Section sizing and chunk-count limits ---

    min_section_size: int = Field(
        description="Minimum characters for a standalone section",
        default=500,
        gt=0,
    )
    max_chunks_per_section: int = Field(
        description="Maximum chunks per section (prevents runaway chunking)",
        default=1000,
        gt=0,
    )
    # Fraction in the closed interval [0.0, 1.0].
    max_overlap_percentage: float = Field(
        description="Maximum overlap between chunks as percentage (0.25 = 25%)",
        default=0.25,
        ge=0.0,
        le=1.0,
    )

    # --- Processing and estimation knobs ---

    max_workers: int = Field(
        description="Maximum worker threads for parallel processing",
        default=4,
        gt=0,
    )
    # Fraction in the closed interval [0.0, 1.0].
    estimation_buffer: float = Field(
        description="Buffer factor for chunk count estimation (0.2 = 20%)",
        default=0.2,
        ge=0.0,
        le=1.0,
    )
    words_per_minute_reading: int = Field(
        description="Words per minute for reading time estimation",
        default=200,
        gt=0,
    )

    # --- Header-count thresholds used for split-level decisions ---

    header_analysis_threshold_h1: int = Field(
        description="H1 header count threshold for split level decisions",
        default=3,
        gt=0,
    )
    header_analysis_threshold_h3: int = Field(
        description="H3 header count threshold for split level decisions",
        default=8,
        gt=0,
    )

    # --- Feature switches ---

    enable_hierarchical_metadata: bool = Field(
        description="Enable extraction of hierarchical section metadata",
        default=True,
    )

154 

155 

class StrategySpecificConfig(BaseModel):
    """Strategy-specific configuration settings."""

    # Each sub-configuration is built from its own defaults when omitted.

    default: DefaultStrategyConfig = Field(
        description="Configuration for default text chunking strategy",
        default_factory=DefaultStrategyConfig,
    )
    html: HtmlStrategyConfig = Field(
        description="Configuration for HTML chunking strategy",
        default_factory=HtmlStrategyConfig,
    )
    code: CodeStrategyConfig = Field(
        description="Configuration for code chunking strategy",
        default_factory=CodeStrategyConfig,
    )
    # The attribute is named `json_strategy` but carries the alias `json`.
    json_strategy: JsonStrategyConfig = Field(
        description="Configuration for JSON chunking strategy",
        default_factory=JsonStrategyConfig,
        alias="json",
    )
    markdown: MarkdownStrategyConfig = Field(
        description="Configuration for Markdown chunking strategy",
        default_factory=MarkdownStrategyConfig,
    )

180 

181 

class ChunkingConfig(BaseModel):
    """Configuration for text chunking."""

    # Target chunk length in characters; must be strictly positive.
    chunk_size: int = Field(
        default=1500,
        description="Size of text chunks in characters",
        gt=0,
        title="Chunk Size",
    )
    # Characters shared between chunks; non-negative and, per the validator
    # below, strictly smaller than `chunk_size`.
    chunk_overlap: int = Field(
        default=200,
        description="Overlap between chunks in characters",
        ge=0,
        title="Chunk Overlap",
    )
    # Safety limit on the number of chunks produced for one document.
    max_chunks_per_document: int = Field(
        default=500,
        description="Maximum number of chunks per document (safety limit)",
        gt=0,
        title="Max Chunks Per Document",
    )

    # Strategy-specific configurations
    strategies: StrategySpecificConfig = Field(
        default_factory=StrategySpecificConfig,
        description="Strategy-specific configuration settings",
    )

    @field_validator("chunk_overlap")
    @classmethod
    def validate_chunk_overlap(cls, v: int, info: ValidationInfo) -> int:
        """Validate that chunk overlap is less than chunk size.

        Args:
            v: The candidate ``chunk_overlap`` value.
            info: Validation context; ``info.data`` is consulted for the
                ``chunk_size`` value.

        Returns:
            ``v`` unchanged when it is smaller than the chunk size.

        Raises:
            ValueError: If ``v`` is greater than or equal to the chunk size.
        """
        # `chunk_size` is declared before `chunk_overlap`. When it is missing
        # from `info.data` (per the pydantic docs this happens when that field
        # failed its own validation), fall back to 1500, which mirrors the
        # default declared on `chunk_size` above.
        chunk_size = info.data.get("chunk_size", 1500)
        if v >= chunk_size:
            raise ValueError("Chunk overlap must be less than chunk size")
        return v