Coverage for src/qdrant_loader/utils/logging.py: 40%

232 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1"""Centralized logging configuration for the application.""" 

2 

3import logging 

4import re 

5 

6import structlog 

7 

8 

9class QdrantVersionFilter(logging.Filter): 

10 """Filter to suppress Qdrant version check warnings.""" 

11 

12 def filter(self, record): 

13 return "version check" not in record.getMessage().lower() 

14 

15 

16class ApplicationFilter(logging.Filter): 

17 """Filter to only show logs from our application.""" 

18 

19 def filter(self, record): 

20 # Only show logs from our application 

21 return not record.name.startswith(("httpx", "httpcore", "urllib3")) 

22 

23 

24class SQLiteFilter(logging.Filter): 

25 """Filter to suppress verbose SQLite operation logs.""" 

26 

27 def filter(self, record): 

28 # Suppress verbose SQLite debug logs 

29 message = record.getMessage() 

30 sqlite_patterns = [ 

31 "executing functools.partial(<built-in method", 

32 "operation functools.partial(<built-in method", 

33 "executing <function connect.", 

34 "operation <function connect.", 

35 "returning exception", 

36 ] 

37 return not any(pattern in message for pattern in sqlite_patterns) 

38 

39 

40class VerbosityFilter(logging.Filter): 

41 """Filter to reduce verbosity of debug messages.""" 

42 

43 def filter(self, record): 

44 # Only filter DEBUG level messages 

45 if record.levelno != logging.DEBUG: 

46 return True 

47 

48 # Suppress overly verbose debug messages 

49 message = record.getMessage() 

50 verbose_patterns = [ 

51 "HTTP Request:", 

52 "Response status:", 

53 "Request headers:", 

54 "Response headers:", 

55 # PDF parsing debug messages 

56 "seek:", 

57 "nexttoken:", 

58 "do_keyword:", 

59 "nextobject:", 

60 "add_results:", 

61 "register:", 

62 "getobj:", 

63 "get_unichr:", 

64 "exec:", 

65 # Character encoding detection 

66 "confidence =", 

67 "prober hit error", 

68 "not active", 

69 # File processing verbosity 

70 "checking if file should be processed", 

71 "current configuration", 

72 "checking file extension", 

73 "file type detection", 

74 "file supported via", 

75 "starting metadata extraction", 

76 "completed metadata extraction", 

77 "document metadata:", 

78 # HTTP client debug 

79 "connect_tcp.started", 

80 "connect_tcp.complete", 

81 "send_request_headers", 

82 "send_request_body", 

83 "receive_response_headers", 

84 "receive_response_body", 

85 "response_closed", 

86 ] 

87 return not any(pattern in message for pattern in verbose_patterns) 

88 

89 

90class WindowsSafeConsoleHandler(logging.StreamHandler): 

91 """Custom console handler that handles Windows encoding issues.""" 

92 

93 # ANSI escape sequence pattern 

94 ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") 

95 

96 def emit(self, record): 

97 """Emit a record, handling Windows console encoding issues.""" 

98 try: 

99 # Check if stream is still open before writing 

100 if hasattr(self.stream, "closed") and self.stream.closed: 

101 return # Skip logging if stream is closed 

102 

103 # Get the formatted message 

104 msg = self.format(record) 

105 

106 # For Windows, handle Unicode characters proactively 

107 import sys 

108 

109 if sys.platform == "win32": 

110 # Replace problematic Unicode characters with safe alternatives 

111 safe_msg = ( 

112 msg.replace("🚀", "[ROCKET]") 

113 .replace("📄", "[DOCUMENT]") 

114 .replace("✅", "[CHECK]") 

115 .replace("❌", "[CROSS]") 

116 .replace("⚠️", "[WARNING]") 

117 .replace("🔍", "[SEARCH]") 

118 .replace("💾", "[SAVE]") 

119 .replace("🔧", "[TOOL]") 

120 .replace("📊", "[CHART]") 

121 .replace("🎯", "[TARGET]") 

122 .replace("⚙️", "[GEAR]") 

123 .replace("⏱️", "[TIMER]") 

124 .replace("⏭️", "[NEXT]") 

125 .replace("🏗️", "[CONSTRUCTION]") 

126 .replace("1️⃣", "[1]") 

127 .replace("2️⃣", "[2]") 

128 .replace("3️⃣", "[3]") 

129 .replace("4️⃣", "[4]") 

130 ) 

131 try: 

132 self.stream.write(safe_msg + self.terminator) 

133 except UnicodeEncodeError: 

134 # Final fallback: encode with ASCII and replace all unsupported chars 

135 ascii_msg = safe_msg.encode("ascii", errors="replace").decode( 

136 "ascii" 

137 ) 

138 self.stream.write(ascii_msg + self.terminator) 

139 else: 

140 # For non-Windows, use standard handling 

141 self.stream.write(msg + self.terminator) 

142 

143 self.flush() 

144 except (ValueError, OSError): 

145 # Stream is closed or unavailable, skip silently 

146 pass 

147 except Exception: 

148 self.handleError(record) 

149 

150 

151class CleanFileHandler(logging.FileHandler): 

152 """Custom file handler that strips ANSI color codes from log messages.""" 

153 

154 # ANSI escape sequence pattern 

155 ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") 

156 

157 def emit(self, record): 

158 """Emit a record, stripping ANSI codes from the message.""" 

159 try: 

160 # Check if stream is still open before writing 

161 if hasattr(self.stream, "closed") and self.stream.closed: 

162 return # Skip logging if stream is closed 

163 

164 # Get the formatted message 

165 msg = self.format(record) 

166 # Strip ANSI escape sequences 

167 clean_msg = self.ANSI_ESCAPE.sub("", msg) 

168 

169 # Handle Windows console encoding issues more robustly 

170 stream = self.stream 

171 try: 

172 stream.write(clean_msg + self.terminator) 

173 except UnicodeEncodeError: 

174 # Fallback for Windows console encoding issues 

175 # Replace problematic Unicode characters with safe alternatives 

176 import sys 

177 

178 if sys.platform == "win32": 

179 # Replace common emoji/Unicode characters with ASCII equivalents 

180 safe_msg = ( 

181 clean_msg.replace("🚀", "[ROCKET]") 

182 .replace("📄", "[DOCUMENT]") 

183 .replace("✅", "[CHECK]") 

184 .replace("❌", "[CROSS]") 

185 .replace("⚠️", "[WARNING]") 

186 .replace("🔍", "[SEARCH]") 

187 .replace("💾", "[SAVE]") 

188 .replace("🔧", "[TOOL]") 

189 .replace("📊", "[CHART]") 

190 .replace("🎯", "[TARGET]") 

191 .replace("⚙️", "[GEAR]") 

192 .replace("⏱️", "[TIMER]") 

193 .replace("⏭️", "[NEXT]") 

194 .replace("🏗️", "[CONSTRUCTION]") 

195 .replace("1️⃣", "[1]") 

196 .replace("2️⃣", "[2]") 

197 .replace("3️⃣", "[3]") 

198 .replace("4️⃣", "[4]") 

199 ) 

200 try: 

201 stream.write(safe_msg + self.terminator) 

202 except UnicodeEncodeError: 

203 # Final fallback: encode with ASCII and replace all unsupported chars 

204 ascii_msg = safe_msg.encode("ascii", errors="replace").decode( 

205 "ascii" 

206 ) 

207 stream.write(ascii_msg + self.terminator) 

208 else: 

209 # For non-Windows, try UTF-8 encoding with replacement 

210 safe_msg = clean_msg.encode("utf-8", errors="replace").decode( 

211 "utf-8", errors="replace" 

212 ) 

213 stream.write(safe_msg + self.terminator) 

214 self.flush() 

215 except (ValueError, OSError): 

216 # Stream is closed or unavailable, skip silently 

217 pass 

218 except Exception: 

219 self.handleError(record) 

220 

221 

222class CleanFormatter(logging.Formatter): 

223 """Custom formatter that shows only the message for INFO level logs.""" 

224 

225 def __init__(self, use_custom_renderer=False): 

226 super().__init__() 

227 self.use_custom_renderer = use_custom_renderer 

228 

229 def format(self, record): 

230 message = record.getMessage() 

231 

232 # If we're using the custom renderer, just return the message as-is 

233 # since the CustomConsoleRenderer already handled the formatting 

234 if self.use_custom_renderer: 

235 return message 

236 

237 # For INFO level, just show the message 

238 if record.levelno == logging.INFO: 

239 return message 

240 else: 

241 # For other levels, we need to reorder timestamp and level 

242 # Check if message starts with a timestamp (HH:MM:SS format) 

243 # The message might contain ANSI color codes, so we need to account for that 

244 time_pattern = ( 

245 r"^(?:\x1b\[[0-9;]*m)?(\d{2}:\d{2}:\d{2})(?:\x1b\[[0-9;]*m)?\s+(.*)" 

246 ) 

247 match = re.match(time_pattern, message) 

248 

249 if match: 

250 timestamp = match.group(1) 

251 rest_of_message = match.group(2) 

252 

253 # Check if the rest_of_message already contains a level tag 

254 # This can happen when CustomConsoleRenderer already formatted it 

255 level_in_message_pattern = ( 

256 r"^\[(?:DEBUG|INFO|WARNING|ERROR|CRITICAL)\]\s" 

257 ) 

258 if re.match(level_in_message_pattern, rest_of_message): 

259 # Level already present, don't add another one 

260 return f"{timestamp} {rest_of_message}" 

261 else: 

262 # Add level tag 

263 return f"{timestamp} [{record.levelname}] {rest_of_message}" 

264 else: 

265 # No timestamp found 

266 # Check if message already has a level tag at the beginning 

267 level_in_message_pattern = ( 

268 r"^\[(?:DEBUG|INFO|WARNING|ERROR|CRITICAL)\]\s" 

269 ) 

270 if re.match(level_in_message_pattern, message): 

271 # Level already present, don't add another one 

272 return message 

273 else: 

274 # Add level tag 

275 return f"[{record.levelname}] {message}" 

276 

277 

278class FileRenderer: 

279 """Custom renderer for file output without timestamps (FileFormatter will add them).""" 

280 

281 def __call__(self, logger, method_name, event_dict): 

282 # Extract the main message 

283 event = event_dict.pop("event", "") 

284 

285 # Format additional key-value pairs 

286 if event_dict: 

287 extras = " ".join(f"{k}={v}" for k, v in event_dict.items()) 

288 return f"{event} {extras}".strip() 

289 else: 

290 return event 

291 

292 

293class FileFormatter(logging.Formatter): 

294 """Custom formatter for file output that provides clean, readable logs without ANSI codes.""" 

295 

296 # ANSI escape sequence pattern 

297 ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") 

298 

299 def format(self, record): 

300 # Get the timestamp 

301 timestamp = self.formatTime(record, "%Y-%m-%d %H:%M:%S") 

302 

303 # Get the level name 

304 level = record.levelname 

305 

306 # Get the message (this will be the already formatted structlog message) 

307 message = record.getMessage() 

308 

309 # First, strip ANSI escape sequences 

310 clean_message = self.ANSI_ESCAPE.sub("", message) 

311 

312 # Now check if the clean message starts with a timestamp and remove it 

313 # Pattern for structlog output: "HH:MM:SS message content" 

314 time_pattern = r"^\d{2}:\d{2}:\d{2}\s+" 

315 if re.match(time_pattern, clean_message): 

316 # Remove the structlog timestamp since we're adding our own 

317 clean_message = re.sub(time_pattern, "", clean_message) 

318 

319 # Check if the message already contains a level tag to avoid duplication 

320 level_in_message_pattern = r"^\[(?:DEBUG|INFO|WARNING|ERROR|CRITICAL)\]\s" 

321 has_level_tag = re.match(level_in_message_pattern, clean_message) 

322 

323 # Format based on log level 

324 if record.levelno == logging.INFO: 

325 # For INFO level, use a clean format: timestamp | message 

326 return f"{timestamp} | {clean_message}" 

327 else: 

328 # For other levels, include the level only if not already present 

329 if has_level_tag: 

330 return f"{timestamp} | {clean_message}" 

331 else: 

332 return f"{timestamp} | [{level}] {clean_message}" 

333 

334 

335class CustomConsoleRenderer: 

336 """Custom console renderer that formats timestamp and level correctly.""" 

337 

338 def __init__(self, colors=True): 

339 self.colors = colors 

340 self._console_renderer = structlog.dev.ConsoleRenderer(colors=colors) 

341 # ANSI color codes 

342 self.gray = "\033[90m" if colors else "" 

343 self.green = "\033[92m" if colors else "" # Bright green for INFO 

344 self.yellow = "\033[93m" if colors else "" # Bright yellow for WARNING 

345 self.red = "\033[91m" if colors else "" # Bright red for ERROR 

346 self.magenta = "\033[95m" if colors else "" # Bright magenta for CRITICAL 

347 self.cyan = "\033[96m" if colors else "" # Bright cyan for DEBUG 

348 self.reset = "\033[0m" if colors else "" 

349 

350 def _get_level_color(self, level): 

351 """Get the appropriate color for a log level.""" 

352 level_colors = { 

353 "DEBUG": self.cyan, 

354 "INFO": self.green, # Green for INFO 

355 "WARNING": self.yellow, 

356 "ERROR": self.red, 

357 "CRITICAL": self.magenta, 

358 } 

359 return level_colors.get(level, "") 

360 

361 def __call__(self, logger, method_name, event_dict): 

362 # Extract timestamp if present 

363 timestamp = event_dict.pop("timestamp", None) 

364 

365 # Get the level from method_name 

366 level = method_name.upper() 

367 

368 # Use the default console renderer to format the rest 

369 formatted = self._console_renderer(logger, method_name, event_dict) 

370 

371 # If we have a timestamp 

372 if timestamp and isinstance(timestamp, str) and len(timestamp) >= 8: 

373 time_part = timestamp[:8] # Get HH:MM:SS part 

374 

375 # Remove the timestamp from the formatted message if it's there 

376 if formatted.startswith(time_part): 

377 formatted = formatted[len(time_part) :].lstrip() 

378 

379 # Add gray color to timestamp 

380 colored_timestamp = f"{self.gray}{time_part}{self.reset}" 

381 

382 # Get colored level for all levels including INFO 

383 level_color = self._get_level_color(level) 

384 colored_level = ( 

385 f"{level_color}[{level}]{self.reset}" if level_color else f"[{level}]" 

386 ) 

387 

388 # Show timestamp, colored level, and message for all levels 

389 return f"{colored_timestamp} {colored_level} {formatted}" 

390 

391 # Fallback if no timestamp 

392 level_color = self._get_level_color(level) 

393 colored_level = ( 

394 f"{level_color}[{level}]{self.reset}" if level_color else f"[{level}]" 

395 ) 

396 return f"{colored_level} {formatted}" 

397 

398 

399class LoggingConfig: 

400 """Centralized logging configuration.""" 

401 

402 _initialized = False 

403 _current_config = None 

404 

405 @classmethod 

406 def setup( 

407 cls, 

408 level: str = "INFO", 

409 format: str = "console", 

410 file: str | None = None, 

411 suppress_qdrant_warnings: bool = True, 

412 clean_output: bool = True, 

413 ) -> None: 

414 """Setup logging configuration. 

415 

416 Args: 

417 level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) 

418 format: Log format (json or console) 

419 file: Path to log file (optional) 

420 suppress_qdrant_warnings: Whether to suppress Qdrant version check warnings 

421 clean_output: Whether to use clean, less verbose output 

422 """ 

423 try: 

424 # Convert string level to logging level 

425 numeric_level = getattr(logging, level.upper()) 

426 except AttributeError: 

427 raise ValueError(f"Invalid log level: {level}") from None 

428 

429 # Reset logging configuration 

430 logging.getLogger().handlers = [] 

431 structlog.reset_defaults() 

432 

433 # Create a list of handlers 

434 handlers = [] 

435 

436 # Add console handler with Windows encoding support 

437 

438 # Use Windows-safe console handler for all platforms 

439 console_handler = WindowsSafeConsoleHandler() 

440 

441 if clean_output and format == "console": 

442 # Use clean formatter for console output 

443 console_handler.setFormatter(CleanFormatter(use_custom_renderer=True)) 

444 else: 

445 console_handler.setFormatter(logging.Formatter("%(message)s")) 

446 

447 console_handler.addFilter(ApplicationFilter()) # Only show our application logs 

448 console_handler.addFilter(SQLiteFilter()) # Suppress verbose SQLite logs 

449 

450 if clean_output: 

451 console_handler.addFilter(VerbosityFilter()) # Reduce verbosity 

452 

453 handlers.append(console_handler) 

454 

455 # Add file handler if file is configured 

456 if file: 

457 file_handler = CleanFileHandler(file) 

458 file_handler.setFormatter(FileFormatter()) 

459 file_handler.addFilter( 

460 SQLiteFilter() 

461 ) # Suppress verbose SQLite logs in files too 

462 # Don't apply verbosity filter to file logs - keep everything for debugging 

463 handlers.append(file_handler) 

464 

465 # Configure standard logging 

466 logging.basicConfig( 

467 level=numeric_level, 

468 format="%(message)s", 

469 handlers=handlers, 

470 ) 

471 

472 # Apply SQLite filter to all existing loggers 

473 root_logger = logging.getLogger() 

474 for handler in root_logger.handlers: 

475 handler.addFilter(SQLiteFilter()) 

476 

477 # Add filter to suppress Qdrant version check warnings 

478 if suppress_qdrant_warnings: 

479 qdrant_logger = logging.getLogger("qdrant_client") 

480 qdrant_logger.addFilter(QdrantVersionFilter()) 

481 

482 # Suppress verbose SQLAlchemy and SQLite logs 

483 sqlalchemy_loggers = [ 

484 "sqlalchemy.engine", 

485 "sqlalchemy.dialects", 

486 "sqlalchemy.pool", 

487 "aiosqlite", 

488 "sqlite3", 

489 ] 

490 

491 for logger_name in sqlalchemy_loggers: 

492 logger = logging.getLogger(logger_name) 

493 logger.addFilter(SQLiteFilter()) 

494 logger.setLevel(logging.WARNING) # Only show warnings and errors 

495 

496 # Suppress verbose third-party library debug logs 

497 noisy_loggers = [ 

498 "chardet", # Character encoding detection 

499 "chardet.charsetprober", 

500 "chardet.latin1prober", 

501 "chardet.mbcharsetprober", 

502 "chardet.sbcharsetprober", 

503 "chardet.utf8prober", 

504 "pdfminer", # PDF parsing 

505 "pdfminer.pdfparser", 

506 "pdfminer.pdfdocument", 

507 "pdfminer.pdfinterp", 

508 "pdfminer.converter", 

509 "pdfplumber", # PDF processing 

510 "markitdown", # File conversion 

511 "httpcore", # HTTP client debug logs 

512 "httpx", # HTTP client debug logs 

513 "gensim", # Topic modeling library 

514 "gensim.models", # LDA model training logs 

515 "gensim.models.ldamodel", # Specific LDA training logs 

516 "gensim.corpora", # Dictionary and corpus logs 

517 ] 

518 

519 for logger_name in noisy_loggers: 

520 logger = logging.getLogger(logger_name) 

521 logger.setLevel(logging.WARNING) # Only show warnings and errors 

522 

523 # Configure structlog processors based on format and clean_output 

524 # Redaction processor to mask sensitive fields in event_dict 

525 def _redact_sensitive(logger, method_name, event_dict): # type: ignore[no-redef] 

526 sensitive_keys = { 

527 "api_key", 

528 "llm_api_key", 

529 "authorization", 

530 "Authorization", 

531 "token", 

532 "access_token", 

533 "secret", 

534 "password", 

535 } 

536 

537 def _mask(value: str) -> str: 

538 try: 

539 if not isinstance(value, str) or not value: 

540 return "***REDACTED***" 

541 # Preserve small hint while masking majority 

542 if len(value) <= 8: 

543 return "***REDACTED***" 

544 return value[:2] + "***REDACTED***" + value[-2:] 

545 except Exception: 

546 return "***REDACTED***" 

547 

548 def _deep_redact(obj): 

549 try: 

550 if isinstance(obj, dict): 

551 red = {} 

552 for k, v in obj.items(): 

553 if k in sensitive_keys: 

554 red[k] = ( 

555 _mask(v) if isinstance(v, str) else "***REDACTED***" 

556 ) 

557 else: 

558 red[k] = _deep_redact(v) 

559 return red 

560 if isinstance(obj, list): 

561 return [_deep_redact(i) for i in obj] 

562 return obj 

563 except Exception: 

564 return obj 

565 

566 return _deep_redact(event_dict) 

567 

568 if clean_output and format == "console": 

569 # Minimal processors for clean output 

570 processors = [ 

571 structlog.stdlib.filter_by_level, 

572 structlog.processors.TimeStamper(fmt="%H:%M:%S"), 

573 _redact_sensitive, 

574 CustomConsoleRenderer(colors=True), 

575 ] 

576 else: 

577 # Full processors for detailed output 

578 processors = [ 

579 structlog.stdlib.filter_by_level, 

580 structlog.stdlib.add_logger_name, 

581 structlog.stdlib.add_log_level, 

582 structlog.processors.TimeStamper(fmt="iso"), 

583 structlog.processors.StackInfoRenderer(), 

584 structlog.processors.UnicodeDecoder(), 

585 structlog.processors.CallsiteParameterAdder( 

586 [ 

587 structlog.processors.CallsiteParameter.FILENAME, 

588 structlog.processors.CallsiteParameter.FUNC_NAME, 

589 structlog.processors.CallsiteParameter.LINENO, 

590 ] 

591 ), 

592 _redact_sensitive, 

593 ] 

594 

595 if format == "json": 

596 processors.append(structlog.processors.JSONRenderer()) 

597 else: 

598 processors.append(structlog.dev.ConsoleRenderer(colors=True)) 

599 

600 # Configure structlog 

601 structlog.configure( 

602 processors=processors, 

603 wrapper_class=structlog.make_filtering_bound_logger(numeric_level), 

604 logger_factory=structlog.stdlib.LoggerFactory(), 

605 cache_logger_on_first_use=False, # Disable caching to ensure new configuration is used 

606 ) 

607 

608 cls._initialized = True 

609 cls._current_config = ( 

610 level, 

611 format, 

612 file, 

613 suppress_qdrant_warnings, 

614 clean_output, 

615 ) 

616 

617 @classmethod 

618 def get_logger(cls, name: str | None = None) -> structlog.BoundLogger: 

619 """Get a logger instance. 

620 

621 Args: 

622 name: Logger name. If None, will use the calling module's name. 

623 

624 Returns: 

625 structlog.BoundLogger: Logger instance 

626 """ 

627 if not cls._initialized: 

628 # Initialize with default settings if not already initialized 

629 cls.setup() 

630 return structlog.get_logger(name) 

631 

632 

633# Standardize on core logging configuration 

634try: 

635 from qdrant_loader_core.logging import ( 

636 LoggingConfig as _CoreLoggingConfig, # type: ignore 

637 ) 

638 

639 LoggingConfig = _CoreLoggingConfig # type: ignore[assignment] 

640except Exception: 

641 # Fallback to local implementation if core is unavailable 

642 pass