Coverage for src/qdrant_loader/core/chunking/chunking_service.py: 100%
60 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-04 05:50 +0000
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-04 05:50 +0000
1"""Service for chunking documents."""
3from pathlib import Path
5from qdrant_loader.config import GlobalConfig, Settings
6from qdrant_loader.core.chunking.strategy import (
7 BaseChunkingStrategy,
8 CodeChunkingStrategy,
9 DefaultChunkingStrategy,
10 HTMLChunkingStrategy,
11 JSONChunkingStrategy,
12 MarkdownChunkingStrategy,
13)
14from qdrant_loader.core.document import Document
15from qdrant_loader.core.monitoring.ingestion_metrics import IngestionMonitor
16from qdrant_loader.utils.logging import LoggingConfig
class ChunkingService:
    """Service for chunking documents into smaller pieces.

    A chunking strategy is selected per document: documents whose metadata
    marks them as converted by MarkItDown use the markdown strategy; otherwise
    the lower-cased ``content_type`` is looked up in ``self.strategies`` and
    unknown types fall back to ``self.default_strategy``.
    """

    def __new__(cls, config: GlobalConfig, settings: Settings):
        """Allocate a new ChunkingService instance.

        Initialization is left to ``__init__``, which Python invokes
        automatically on the returned instance when the class is called.

        Args:
            config: Global configuration
            settings: Application settings
        """
        # Do not call __init__ here: the instantiation protocol already calls
        # it once, so an explicit call would validate the config and build the
        # monitor and default strategy twice for every service created.
        return super().__new__(cls)

    def __init__(self, config: GlobalConfig, settings: Settings):
        """Initialize the chunking service.

        Validates the chunking configuration, creates the ``metrics``
        directory under the current working directory, and registers the
        strategy classes available per file type.

        Args:
            config: Global configuration
            settings: Application settings

        Raises:
            ValueError: If chunk size or overlap parameters are invalid.
        """
        self.config = config
        self.settings = settings
        self.validate_config()
        self.logger = LoggingConfig.get_logger(__name__)

        # Initialize metrics directory
        metrics_dir = Path.cwd() / "metrics"
        metrics_dir.mkdir(parents=True, exist_ok=True)
        self.monitor = IngestionMonitor(str(metrics_dir.absolute()))

        # Strategy classes keyed by the lower-cased document content type;
        # they are instantiated on demand in _get_strategy.
        self.strategies: dict[str, type[BaseChunkingStrategy]] = {
            "md": MarkdownChunkingStrategy,
            "html": HTMLChunkingStrategy,
            # JSON files
            "json": JSONChunkingStrategy,
            # Programming languages
            "py": CodeChunkingStrategy,
            "java": CodeChunkingStrategy,
            "js": CodeChunkingStrategy,
            "ts": CodeChunkingStrategy,
            "go": CodeChunkingStrategy,
            "rs": CodeChunkingStrategy,
            "cpp": CodeChunkingStrategy,
            "c": CodeChunkingStrategy,
            "cs": CodeChunkingStrategy,
            "php": CodeChunkingStrategy,
            "rb": CodeChunkingStrategy,
            "kt": CodeChunkingStrategy,
            "swift": CodeChunkingStrategy,
            "scala": CodeChunkingStrategy,
            # Add more strategies here as needed
        }

        # Default strategy for unknown file types (one shared instance)
        self.default_strategy = DefaultChunkingStrategy(settings=self.settings)

    def validate_config(self) -> None:
        """Validate the chunking configuration.

        Raises:
            ValueError: If the chunk size is not positive, the overlap is
                negative, or the overlap is not smaller than the chunk size.
        """
        chunking = self.config.chunking
        if chunking.chunk_size <= 0:
            raise ValueError("Chunk size must be greater than 0")
        if chunking.chunk_overlap < 0:
            raise ValueError("Chunk overlap must be non-negative")
        if chunking.chunk_overlap >= chunking.chunk_size:
            raise ValueError("Chunk overlap must be less than chunk size")

    def _get_strategy(self, document: Document) -> BaseChunkingStrategy:
        """Get the appropriate chunking strategy for a document.

        Args:
            document: The document to chunk

        Returns:
            A markdown strategy for MarkItDown-converted documents, a new
            instance of the strategy registered for the document's content
            type, or the shared default strategy when none is registered.
        """
        # Check if this is a converted file
        conversion_method = document.metadata.get("conversion_method")
        if conversion_method == "markitdown":
            # Files converted with MarkItDown are now in markdown format
            self.logger.info(
                "Using markdown strategy for converted file",
                original_file_type=document.metadata.get("original_file_type"),
                conversion_method=conversion_method,
                document_id=document.id,
                document_title=document.title,
            )
            return MarkdownChunkingStrategy(self.settings)
        elif conversion_method == "markitdown_fallback":
            # Fallback documents are also in markdown format
            self.logger.info(
                "Using markdown strategy for fallback converted file",
                original_file_type=document.metadata.get("original_file_type"),
                conversion_method=conversion_method,
                conversion_failed=document.metadata.get("conversion_failed", False),
                document_id=document.id,
                document_title=document.title,
            )
            return MarkdownChunkingStrategy(self.settings)

        # Get file extension from the document content type
        file_type = document.content_type.lower()

        self.logger.debug(
            "Selecting chunking strategy",
            file_type=file_type,
            available_strategies=list(self.strategies.keys()),
            document_id=document.id,
            document_source=document.source,
            document_title=document.title,
            conversion_method=conversion_method,
        )

        # Get strategy class for file type
        strategy_class = self.strategies.get(file_type)

        if strategy_class:
            self.logger.debug(
                "Using specific strategy for this file type",
                file_type=file_type,
                strategy=strategy_class.__name__,
                document_id=document.id,
                document_title=document.title,
            )
            return strategy_class(self.settings)

        self.logger.debug(
            "No specific strategy found for this file type, using default text chunking strategy",
            file_type=file_type,
            document_id=document.id,
            document_title=document.title,
        )
        return self.default_strategy

    def chunk_document(self, document: Document) -> list[Document]:
        """Chunk a document into smaller pieces.

        A document without content yields a single chunk: a copy of the
        document whose metadata carries ``chunk_index`` 0 and
        ``total_chunks`` 1. The input document is not modified.

        Args:
            document: The document to chunk

        Returns:
            List of chunked documents

        Raises:
            Exception: Any error raised by the selected strategy is logged
                and re-raised unchanged.
        """
        self.logger.debug(
            "Starting document chunking",
            extra={
                "doc_id": document.id,
                "source": document.source,
                "source_type": document.source_type,
                # Guarded so missing content reaches the empty-document
                # branch below instead of failing inside len().
                "content_size": len(document.content) if document.content else 0,
                "content_type": document.content_type,
            },
        )

        if not document.content:
            # Return a single empty chunk if document has no content.
            # Deep copy: assuming Pydantic v2 model_copy semantics, the
            # default shallow copy shares the metadata dict with the input,
            # so update() would also write into the caller's document.
            empty_doc = document.model_copy(deep=True)
            empty_doc.metadata.update({"chunk_index": 0, "total_chunks": 1})
            self.logger.debug(
                "Empty document, returning single empty chunk",
                extra={"doc_id": document.id, "chunk_id": empty_doc.id},
            )
            return [empty_doc]

        # Get the appropriate strategy for the document type
        strategy = self._get_strategy(document)
        self.logger.debug(
            "Selected chunking strategy",
            extra={
                "doc_id": document.id,
                "strategy": strategy.__class__.__name__,
                "content_type": document.content_type,
            },
        )

        try:
            # Chunk the document using the selected strategy
            chunked_docs = strategy.chunk_document(document)
            self.logger.debug(
                "Document chunking completed",
                extra={
                    "doc_id": document.id,
                    "chunk_count": len(chunked_docs),
                    # Average is reported as 0 when no chunks were produced
                    "avg_chunk_size": (
                        sum(len(d.content) for d in chunked_docs) / len(chunked_docs)
                        if chunked_docs
                        else 0
                    ),
                },
            )
            return chunked_docs
        except Exception as e:
            # Log with context, then let the caller decide how to handle it
            self.logger.error(
                f"Error chunking document {document.id}: {str(e)}",
                extra={
                    "doc_id": document.id,
                    "error": str(e),
                    "error_type": type(e).__name__,
                    "strategy": strategy.__class__.__name__,
                },
            )
            raise