Coverage for src/qdrant_loader/core/chunking/chunking_service.py: 100%

60 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-04 05:50 +0000

1"""Service for chunking documents.""" 

2 

3from pathlib import Path 

4 

5from qdrant_loader.config import GlobalConfig, Settings 

6from qdrant_loader.core.chunking.strategy import ( 

7 BaseChunkingStrategy, 

8 CodeChunkingStrategy, 

9 DefaultChunkingStrategy, 

10 HTMLChunkingStrategy, 

11 JSONChunkingStrategy, 

12 MarkdownChunkingStrategy, 

13) 

14from qdrant_loader.core.document import Document 

15from qdrant_loader.core.monitoring.ingestion_metrics import IngestionMonitor 

16from qdrant_loader.utils.logging import LoggingConfig 

17 

18 

class ChunkingService:
    """Service for chunking documents into smaller pieces.

    The service validates the chunking configuration, sets up an
    ``IngestionMonitor`` rooted at ``<cwd>/metrics``, and dispatches each
    document to a chunking strategy chosen from its ``conversion_method``
    metadata or, failing that, its lower-cased ``content_type``.
    """

    def __new__(cls, config: GlobalConfig, settings: Settings):
        """Allocate a new, not yet initialised ChunkingService.

        Python calls ``__init__`` automatically with the same arguments once
        ``__new__`` returns an instance of ``cls``, so this method must not
        call ``__init__`` itself; doing so ran the whole initialisation
        (validation, metrics directory creation, monitor construction) twice
        per instantiation.

        Args:
            config: Global configuration (consumed by ``__init__``)
            settings: Application settings (consumed by ``__init__``)
        """
        return super().__new__(cls)

    def __init__(self, config: GlobalConfig, settings: Settings):
        """Initialize the chunking service.

        Args:
            config: Global configuration
            settings: Application settings

        Raises:
            ValueError: If the chunking configuration is invalid
                (see ``validate_config``).
        """
        self.config = config
        self.settings = settings
        # Fail fast on a bad configuration before touching the filesystem.
        self.validate_config()
        self.logger = LoggingConfig.get_logger(__name__)

        # Initialize metrics directory
        metrics_dir = Path.cwd() / "metrics"
        metrics_dir.mkdir(parents=True, exist_ok=True)
        self.monitor = IngestionMonitor(str(metrics_dir.absolute()))

        # Initialize strategies: maps a lower-cased content type to the
        # strategy class that is instantiated per document in _get_strategy.
        self.strategies: dict[str, type[BaseChunkingStrategy]] = {
            "md": MarkdownChunkingStrategy,
            "html": HTMLChunkingStrategy,
            # JSON files
            "json": JSONChunkingStrategy,
            # Programming languages
            "py": CodeChunkingStrategy,
            "java": CodeChunkingStrategy,
            "js": CodeChunkingStrategy,
            "ts": CodeChunkingStrategy,
            "go": CodeChunkingStrategy,
            "rs": CodeChunkingStrategy,
            "cpp": CodeChunkingStrategy,
            "c": CodeChunkingStrategy,
            "cs": CodeChunkingStrategy,
            "php": CodeChunkingStrategy,
            "rb": CodeChunkingStrategy,
            "kt": CodeChunkingStrategy,
            "swift": CodeChunkingStrategy,
            "scala": CodeChunkingStrategy,
            # Add more strategies here as needed
        }

        # Default strategy for unknown file types (one shared instance)
        self.default_strategy = DefaultChunkingStrategy(settings=self.settings)

    def validate_config(self) -> None:
        """Validate the configuration.

        Raises:
            ValueError: If chunk size or overlap parameters are invalid.
        """
        chunking = self.config.chunking
        if chunking.chunk_size <= 0:
            raise ValueError("Chunk size must be greater than 0")
        if chunking.chunk_overlap < 0:
            raise ValueError("Chunk overlap must be non-negative")
        if chunking.chunk_overlap >= chunking.chunk_size:
            raise ValueError("Chunk overlap must be less than chunk size")

    def _get_strategy(self, document: Document) -> BaseChunkingStrategy:
        """Get the appropriate chunking strategy for a document.

        Selection order:
            1. ``conversion_method`` metadata of ``"markitdown"`` or
               ``"markitdown_fallback"`` -> a new markdown strategy.
            2. Lower-cased ``content_type`` found in ``self.strategies`` ->
               a new instance of the mapped strategy class.
            3. Otherwise the shared ``self.default_strategy``.

        Args:
            document: The document to chunk

        Returns:
            The appropriate chunking strategy for the document type
        """
        # Check if this is a converted file
        conversion_method = document.metadata.get("conversion_method")
        if conversion_method == "markitdown":
            # Files converted with MarkItDown are now in markdown format
            self.logger.info(
                "Using markdown strategy for converted file",
                original_file_type=document.metadata.get("original_file_type"),
                conversion_method=conversion_method,
                document_id=document.id,
                document_title=document.title,
            )
            return MarkdownChunkingStrategy(self.settings)
        elif conversion_method == "markitdown_fallback":
            # Fallback documents are also in markdown format
            self.logger.info(
                "Using markdown strategy for fallback converted file",
                original_file_type=document.metadata.get("original_file_type"),
                conversion_method=conversion_method,
                conversion_failed=document.metadata.get("conversion_failed", False),
                document_id=document.id,
                document_title=document.title,
            )
            return MarkdownChunkingStrategy(self.settings)

        # Get file extension from the document content type
        file_type = document.content_type.lower()

        self.logger.debug(
            "Selecting chunking strategy",
            file_type=file_type,
            available_strategies=list(self.strategies.keys()),
            document_id=document.id,
            document_source=document.source,
            document_title=document.title,
            conversion_method=conversion_method,
        )

        # Get strategy class for file type
        strategy_class = self.strategies.get(file_type)

        if strategy_class:
            self.logger.debug(
                "Using specific strategy for this file type",
                file_type=file_type,
                strategy=strategy_class.__name__,
                document_id=document.id,
                document_title=document.title,
            )
            return strategy_class(self.settings)

        self.logger.debug(
            "No specific strategy found for this file type, using default text chunking strategy",
            file_type=file_type,
            document_id=document.id,
            document_title=document.title,
        )
        return self.default_strategy

    def chunk_document(self, document: Document) -> list[Document]:
        """Chunk a document into smaller pieces.

        A document without content yields exactly one chunk: a copy of the
        document whose metadata carries ``chunk_index`` 0 and
        ``total_chunks`` 1. The input document's metadata is left untouched.

        Args:
            document: The document to chunk

        Returns:
            List of chunked documents

        Raises:
            Exception: Whatever the selected strategy raises; the error is
                logged and re-raised unchanged.
        """
        # NOTE(review): this method passes log context via ``extra={...}``
        # while ``_get_strategy`` passes keyword arguments directly; which
        # form the logger expects is not visible here - confirm and unify.
        self.logger.debug(
            "Starting document chunking",
            extra={
                "doc_id": document.id,
                "source": document.source,
                "source_type": document.source_type,
                # ``or ""`` keeps a missing (None) content from raising
                # before the empty-document guard below is reached.
                "content_size": len(document.content or ""),
                "content_type": document.content_type,
            },
        )

        if not document.content:
            # Return a single empty chunk if document has no content.
            # model_copy() is shallow, so updating the copy's metadata in
            # place would also modify the caller's document; supply a fresh
            # dict through ``update`` instead.
            # Assumes Document is a Pydantic v2 model - TODO confirm.
            chunk_metadata = {
                **document.metadata,
                "chunk_index": 0,
                "total_chunks": 1,
            }
            empty_doc = document.model_copy(update={"metadata": chunk_metadata})
            self.logger.debug(
                "Empty document, returning single empty chunk",
                extra={"doc_id": document.id, "chunk_id": empty_doc.id},
            )
            return [empty_doc]

        # Get the appropriate strategy for the document type
        strategy = self._get_strategy(document)
        self.logger.debug(
            "Selected chunking strategy",
            extra={
                "doc_id": document.id,
                "strategy": strategy.__class__.__name__,
                "content_type": document.content_type,
            },
        )

        try:
            # Chunk the document using the selected strategy
            chunked_docs = strategy.chunk_document(document)
            self.logger.debug(
                "Document chunking completed",
                extra={
                    "doc_id": document.id,
                    "chunk_count": len(chunked_docs),
                    # Guard against division by zero when no chunks come back.
                    "avg_chunk_size": (
                        sum(len(d.content) for d in chunked_docs) / len(chunked_docs)
                        if chunked_docs
                        else 0
                    ),
                },
            )
            return chunked_docs
        except Exception as e:
            # Log with context, then let the caller decide how to recover.
            self.logger.error(
                f"Error chunking document {document.id}: {str(e)}",
                extra={
                    "doc_id": document.id,
                    "error": str(e),
                    "error_type": type(e).__name__,
                    "strategy": strategy.__class__.__name__,
                },
            )
            raise

223 ) 

224 raise