Coverage for src/qdrant_loader/utils/logging.py: 73%

193 statements  

« prev     ^ index     » next       coverage.py v7.10.0, created at 2025-07-25 11:39 +0000

1"""Centralized logging configuration for the application.""" 

2 

3import logging 

4import re 

5 

6import structlog 

7 

8 

9class QdrantVersionFilter(logging.Filter): 

10 """Filter to suppress Qdrant version check warnings.""" 

11 

12 def filter(self, record): 

13 return "version check" not in record.getMessage().lower() 

14 

15 

16class ApplicationFilter(logging.Filter): 

17 """Filter to only show logs from our application.""" 

18 

19 def filter(self, record): 

20 # Only show logs from our application 

21 return not record.name.startswith(("httpx", "httpcore", "urllib3")) 

22 

23 

24class SQLiteFilter(logging.Filter): 

25 """Filter to suppress verbose SQLite operation logs.""" 

26 

27 def filter(self, record): 

28 # Suppress verbose SQLite debug logs 

29 message = record.getMessage() 

30 sqlite_patterns = [ 

31 "executing functools.partial(<built-in method", 

32 "operation functools.partial(<built-in method", 

33 "executing <function connect.", 

34 "operation <function connect.", 

35 "returning exception", 

36 ] 

37 return not any(pattern in message for pattern in sqlite_patterns) 

38 

39 

40class VerbosityFilter(logging.Filter): 

41 """Filter to reduce verbosity of debug messages.""" 

42 

43 def filter(self, record): 

44 # Only filter DEBUG level messages 

45 if record.levelno != logging.DEBUG: 

46 return True 

47 

48 # Suppress overly verbose debug messages 

49 message = record.getMessage() 

50 verbose_patterns = [ 

51 "HTTP Request:", 

52 "Response status:", 

53 "Request headers:", 

54 "Response headers:", 

55 # PDF parsing debug messages 

56 "seek:", 

57 "nexttoken:", 

58 "do_keyword:", 

59 "nextobject:", 

60 "add_results:", 

61 "register:", 

62 "getobj:", 

63 "get_unichr:", 

64 "exec:", 

65 # Character encoding detection 

66 "confidence =", 

67 "prober hit error", 

68 "not active", 

69 # File processing verbosity 

70 "checking if file should be processed", 

71 "current configuration", 

72 "checking file extension", 

73 "file type detection", 

74 "file supported via", 

75 "starting metadata extraction", 

76 "completed metadata extraction", 

77 "document metadata:", 

78 # HTTP client debug 

79 "connect_tcp.started", 

80 "connect_tcp.complete", 

81 "send_request_headers", 

82 "send_request_body", 

83 "receive_response_headers", 

84 "receive_response_body", 

85 "response_closed", 

86 ] 

87 return not any(pattern in message for pattern in verbose_patterns) 

88 

89 

90class WindowsSafeConsoleHandler(logging.StreamHandler): 

91 """Custom console handler that handles Windows encoding issues.""" 

92 

93 # ANSI escape sequence pattern 

94 ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") 

95 

96 def emit(self, record): 

97 """Emit a record, handling Windows console encoding issues.""" 

98 try: 

99 # Get the formatted message 

100 msg = self.format(record) 

101 

102 # For Windows, handle Unicode characters proactively 

103 import sys 

104 

105 if sys.platform == "win32": 

106 # Replace problematic Unicode characters with safe alternatives 

107 safe_msg = ( 

108 msg.replace("🚀", "[ROCKET]") 

109 .replace("📄", "[DOCUMENT]") 

110 .replace("✅", "[CHECK]") 

111 .replace("❌", "[CROSS]") 

112 .replace("⚠️", "[WARNING]") 

113 .replace("🔍", "[SEARCH]") 

114 .replace("💾", "[SAVE]") 

115 .replace("🔧", "[TOOL]") 

116 .replace("📊", "[CHART]") 

117 .replace("🎯", "[TARGET]") 

118 .replace("⚙️", "[GEAR]") 

119 .replace("⏱️", "[TIMER]") 

120 .replace("⏭️", "[NEXT]") 

121 .replace("🏗️", "[CONSTRUCTION]") 

122 .replace("1️⃣", "[1]") 

123 .replace("2️⃣", "[2]") 

124 .replace("3️⃣", "[3]") 

125 .replace("4️⃣", "[4]") 

126 ) 

127 try: 

128 self.stream.write(safe_msg + self.terminator) 

129 except UnicodeEncodeError: 

130 # Final fallback: encode with ASCII and replace all unsupported chars 

131 ascii_msg = safe_msg.encode("ascii", errors="replace").decode( 

132 "ascii" 

133 ) 

134 self.stream.write(ascii_msg + self.terminator) 

135 else: 

136 # For non-Windows, use standard handling 

137 self.stream.write(msg + self.terminator) 

138 

139 self.flush() 

140 except Exception: 

141 self.handleError(record) 

142 

143 

144class CleanFileHandler(logging.FileHandler): 

145 """Custom file handler that strips ANSI color codes from log messages.""" 

146 

147 # ANSI escape sequence pattern 

148 ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") 

149 

150 def emit(self, record): 

151 """Emit a record, stripping ANSI codes from the message.""" 

152 try: 

153 # Get the formatted message 

154 msg = self.format(record) 

155 # Strip ANSI escape sequences 

156 clean_msg = self.ANSI_ESCAPE.sub("", msg) 

157 

158 # Handle Windows console encoding issues more robustly 

159 stream = self.stream 

160 try: 

161 stream.write(clean_msg + self.terminator) 

162 except UnicodeEncodeError: 

163 # Fallback for Windows console encoding issues 

164 # Replace problematic Unicode characters with safe alternatives 

165 import sys 

166 

167 if sys.platform == "win32": 

168 # Replace common emoji/Unicode characters with ASCII equivalents 

169 safe_msg = ( 

170 clean_msg.replace("🚀", "[ROCKET]") 

171 .replace("📄", "[DOCUMENT]") 

172 .replace("✅", "[CHECK]") 

173 .replace("❌", "[CROSS]") 

174 .replace("⚠️", "[WARNING]") 

175 .replace("🔍", "[SEARCH]") 

176 .replace("💾", "[SAVE]") 

177 .replace("🔧", "[TOOL]") 

178 .replace("📊", "[CHART]") 

179 .replace("🎯", "[TARGET]") 

180 .replace("⚙️", "[GEAR]") 

181 .replace("⏱️", "[TIMER]") 

182 .replace("⏭️", "[NEXT]") 

183 .replace("🏗️", "[CONSTRUCTION]") 

184 .replace("1️⃣", "[1]") 

185 .replace("2️⃣", "[2]") 

186 .replace("3️⃣", "[3]") 

187 .replace("4️⃣", "[4]") 

188 ) 

189 try: 

190 stream.write(safe_msg + self.terminator) 

191 except UnicodeEncodeError: 

192 # Final fallback: encode with ASCII and replace all unsupported chars 

193 ascii_msg = safe_msg.encode("ascii", errors="replace").decode( 

194 "ascii" 

195 ) 

196 stream.write(ascii_msg + self.terminator) 

197 else: 

198 # For non-Windows, try UTF-8 encoding with replacement 

199 safe_msg = clean_msg.encode("utf-8", errors="replace").decode( 

200 "utf-8", errors="replace" 

201 ) 

202 stream.write(safe_msg + self.terminator) 

203 self.flush() 

204 except Exception: 

205 self.handleError(record) 

206 

207 

208class CleanFormatter(logging.Formatter): 

209 """Custom formatter that shows only the message for INFO level logs.""" 

210 

211 def __init__(self, use_custom_renderer=False): 

212 super().__init__() 

213 self.use_custom_renderer = use_custom_renderer 

214 

215 def format(self, record): 

216 message = record.getMessage() 

217 

218 # If we're using the custom renderer, just return the message as-is 

219 # since the CustomConsoleRenderer already handled the formatting 

220 if self.use_custom_renderer: 

221 return message 

222 

223 # For INFO level, just show the message 

224 if record.levelno == logging.INFO: 

225 return message 

226 else: 

227 # For other levels, we need to reorder timestamp and level 

228 # Check if message starts with a timestamp (HH:MM:SS format) 

229 # The message might contain ANSI color codes, so we need to account for that 

230 time_pattern = ( 

231 r"^(?:\x1b\[[0-9;]*m)?(\d{2}:\d{2}:\d{2})(?:\x1b\[[0-9;]*m)?\s+(.*)" 

232 ) 

233 match = re.match(time_pattern, message) 

234 

235 if match: 

236 timestamp = match.group(1) 

237 rest_of_message = match.group(2) 

238 

239 # Check if the rest_of_message already contains a level tag 

240 # This can happen when CustomConsoleRenderer already formatted it 

241 level_in_message_pattern = ( 

242 r"^\[(?:DEBUG|INFO|WARNING|ERROR|CRITICAL)\]\s" 

243 ) 

244 if re.match(level_in_message_pattern, rest_of_message): 

245 # Level already present, don't add another one 

246 return f"{timestamp} {rest_of_message}" 

247 else: 

248 # Add level tag 

249 return f"{timestamp} [{record.levelname}] {rest_of_message}" 

250 else: 

251 # No timestamp found 

252 # Check if message already has a level tag at the beginning 

253 level_in_message_pattern = ( 

254 r"^\[(?:DEBUG|INFO|WARNING|ERROR|CRITICAL)\]\s" 

255 ) 

256 if re.match(level_in_message_pattern, message): 

257 # Level already present, don't add another one 

258 return message 

259 else: 

260 # Add level tag 

261 return f"[{record.levelname}] {message}" 

262 

263 

264class FileRenderer: 

265 """Custom renderer for file output without timestamps (FileFormatter will add them).""" 

266 

267 def __call__(self, logger, method_name, event_dict): 

268 # Extract the main message 

269 event = event_dict.pop("event", "") 

270 

271 # Format additional key-value pairs 

272 if event_dict: 

273 extras = " ".join(f"{k}={v}" for k, v in event_dict.items()) 

274 return f"{event} {extras}".strip() 

275 else: 

276 return event 

277 

278 

279class FileFormatter(logging.Formatter): 

280 """Custom formatter for file output that provides clean, readable logs without ANSI codes.""" 

281 

282 # ANSI escape sequence pattern 

283 ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") 

284 

285 def format(self, record): 

286 # Get the timestamp 

287 timestamp = self.formatTime(record, "%Y-%m-%d %H:%M:%S") 

288 

289 # Get the level name 

290 level = record.levelname 

291 

292 # Get the message (this will be the already formatted structlog message) 

293 message = record.getMessage() 

294 

295 # First, strip ANSI escape sequences 

296 clean_message = self.ANSI_ESCAPE.sub("", message) 

297 

298 # Now check if the clean message starts with a timestamp and remove it 

299 # Pattern for structlog output: "HH:MM:SS message content" 

300 time_pattern = r"^\d{2}:\d{2}:\d{2}\s+" 

301 if re.match(time_pattern, clean_message): 

302 # Remove the structlog timestamp since we're adding our own 

303 clean_message = re.sub(time_pattern, "", clean_message) 

304 

305 # Check if the message already contains a level tag to avoid duplication 

306 level_in_message_pattern = r"^\[(?:DEBUG|INFO|WARNING|ERROR|CRITICAL)\]\s" 

307 has_level_tag = re.match(level_in_message_pattern, clean_message) 

308 

309 # Format based on log level 

310 if record.levelno == logging.INFO: 

311 # For INFO level, use a clean format: timestamp | message 

312 return f"{timestamp} | {clean_message}" 

313 else: 

314 # For other levels, include the level only if not already present 

315 if has_level_tag: 

316 return f"{timestamp} | {clean_message}" 

317 else: 

318 return f"{timestamp} | [{level}] {clean_message}" 

319 

320 

321class CustomConsoleRenderer: 

322 """Custom console renderer that formats timestamp and level correctly.""" 

323 

324 def __init__(self, colors=True): 

325 self.colors = colors 

326 self._console_renderer = structlog.dev.ConsoleRenderer(colors=colors) 

327 # ANSI color codes 

328 self.gray = "\033[90m" if colors else "" 

329 self.green = "\033[92m" if colors else "" # Bright green for INFO 

330 self.yellow = "\033[93m" if colors else "" # Bright yellow for WARNING 

331 self.red = "\033[91m" if colors else "" # Bright red for ERROR 

332 self.magenta = "\033[95m" if colors else "" # Bright magenta for CRITICAL 

333 self.cyan = "\033[96m" if colors else "" # Bright cyan for DEBUG 

334 self.reset = "\033[0m" if colors else "" 

335 

336 def _get_level_color(self, level): 

337 """Get the appropriate color for a log level.""" 

338 level_colors = { 

339 "DEBUG": self.cyan, 

340 "INFO": self.green, # Green for INFO 

341 "WARNING": self.yellow, 

342 "ERROR": self.red, 

343 "CRITICAL": self.magenta, 

344 } 

345 return level_colors.get(level, "") 

346 

347 def __call__(self, logger, method_name, event_dict): 

348 # Extract timestamp if present 

349 timestamp = event_dict.pop("timestamp", None) 

350 

351 # Get the level from method_name 

352 level = method_name.upper() 

353 

354 # Use the default console renderer to format the rest 

355 formatted = self._console_renderer(logger, method_name, event_dict) 

356 

357 # If we have a timestamp 

358 if timestamp and isinstance(timestamp, str) and len(timestamp) >= 8: 

359 time_part = timestamp[:8] # Get HH:MM:SS part 

360 

361 # Remove the timestamp from the formatted message if it's there 

362 if formatted.startswith(time_part): 

363 formatted = formatted[len(time_part) :].lstrip() 

364 

365 # Add gray color to timestamp 

366 colored_timestamp = f"{self.gray}{time_part}{self.reset}" 

367 

368 # Get colored level for all levels including INFO 

369 level_color = self._get_level_color(level) 

370 colored_level = ( 

371 f"{level_color}[{level}]{self.reset}" if level_color else f"[{level}]" 

372 ) 

373 

374 # Show timestamp, colored level, and message for all levels 

375 return f"{colored_timestamp} {colored_level} {formatted}" 

376 

377 # Fallback if no timestamp 

378 level_color = self._get_level_color(level) 

379 colored_level = ( 

380 f"{level_color}[{level}]{self.reset}" if level_color else f"[{level}]" 

381 ) 

382 return f"{colored_level} {formatted}" 

383 

384 

385class LoggingConfig: 

386 """Centralized logging configuration.""" 

387 

388 _initialized = False 

389 _current_config = None 

390 

391 @classmethod 

392 def setup( 

393 cls, 

394 level: str = "INFO", 

395 format: str = "console", 

396 file: str | None = None, 

397 suppress_qdrant_warnings: bool = True, 

398 clean_output: bool = True, 

399 ) -> None: 

400 """Setup logging configuration. 

401 

402 Args: 

403 level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) 

404 format: Log format (json or console) 

405 file: Path to log file (optional) 

406 suppress_qdrant_warnings: Whether to suppress Qdrant version check warnings 

407 clean_output: Whether to use clean, less verbose output 

408 """ 

409 try: 

410 # Convert string level to logging level 

411 numeric_level = getattr(logging, level.upper()) 

412 except AttributeError: 

413 raise ValueError(f"Invalid log level: {level}") from None 

414 

415 # Reset logging configuration 

416 logging.getLogger().handlers = [] 

417 structlog.reset_defaults() 

418 

419 # Create a list of handlers 

420 handlers = [] 

421 

422 # Add console handler with Windows encoding support 

423 

424 # Use Windows-safe console handler for all platforms 

425 console_handler = WindowsSafeConsoleHandler() 

426 

427 if clean_output and format == "console": 

428 # Use clean formatter for console output 

429 console_handler.setFormatter(CleanFormatter(use_custom_renderer=True)) 

430 else: 

431 console_handler.setFormatter(logging.Formatter("%(message)s")) 

432 

433 console_handler.addFilter(ApplicationFilter()) # Only show our application logs 

434 console_handler.addFilter(SQLiteFilter()) # Suppress verbose SQLite logs 

435 

436 if clean_output: 

437 console_handler.addFilter(VerbosityFilter()) # Reduce verbosity 

438 

439 handlers.append(console_handler) 

440 

441 # Add file handler if file is configured 

442 if file: 

443 file_handler = CleanFileHandler(file) 

444 file_handler.setFormatter(FileFormatter()) 

445 file_handler.addFilter( 

446 SQLiteFilter() 

447 ) # Suppress verbose SQLite logs in files too 

448 # Don't apply verbosity filter to file logs - keep everything for debugging 

449 handlers.append(file_handler) 

450 

451 # Configure standard logging 

452 logging.basicConfig( 

453 level=numeric_level, 

454 format="%(message)s", 

455 handlers=handlers, 

456 ) 

457 

458 # Apply SQLite filter to all existing loggers 

459 root_logger = logging.getLogger() 

460 for handler in root_logger.handlers: 

461 handler.addFilter(SQLiteFilter()) 

462 

463 # Add filter to suppress Qdrant version check warnings 

464 if suppress_qdrant_warnings: 

465 qdrant_logger = logging.getLogger("qdrant_client") 

466 qdrant_logger.addFilter(QdrantVersionFilter()) 

467 

468 # Suppress verbose SQLAlchemy and SQLite logs 

469 sqlalchemy_loggers = [ 

470 "sqlalchemy.engine", 

471 "sqlalchemy.dialects", 

472 "sqlalchemy.pool", 

473 "aiosqlite", 

474 "sqlite3", 

475 ] 

476 

477 for logger_name in sqlalchemy_loggers: 

478 logger = logging.getLogger(logger_name) 

479 logger.addFilter(SQLiteFilter()) 

480 logger.setLevel(logging.WARNING) # Only show warnings and errors 

481 

482 # Suppress verbose third-party library debug logs 

483 noisy_loggers = [ 

484 "chardet", # Character encoding detection 

485 "chardet.charsetprober", 

486 "chardet.latin1prober", 

487 "chardet.mbcharsetprober", 

488 "chardet.sbcharsetprober", 

489 "chardet.utf8prober", 

490 "pdfminer", # PDF parsing 

491 "pdfminer.pdfparser", 

492 "pdfminer.pdfdocument", 

493 "pdfminer.pdfinterp", 

494 "pdfminer.converter", 

495 "pdfplumber", # PDF processing 

496 "markitdown", # File conversion 

497 "httpcore", # HTTP client debug logs 

498 "httpx", # HTTP client debug logs 

499 "gensim", # Topic modeling library 

500 "gensim.models", # LDA model training logs 

501 "gensim.models.ldamodel", # Specific LDA training logs 

502 "gensim.corpora", # Dictionary and corpus logs 

503 ] 

504 

505 for logger_name in noisy_loggers: 

506 logger = logging.getLogger(logger_name) 

507 logger.setLevel(logging.WARNING) # Only show warnings and errors 

508 

509 # Configure structlog processors based on format and clean_output 

510 if clean_output and format == "console": 

511 # Minimal processors for clean output 

512 processors = [ 

513 structlog.stdlib.filter_by_level, 

514 structlog.processors.TimeStamper(fmt="%H:%M:%S"), 

515 CustomConsoleRenderer(colors=True), 

516 ] 

517 else: 

518 # Full processors for detailed output 

519 processors = [ 

520 structlog.stdlib.filter_by_level, 

521 structlog.stdlib.add_logger_name, 

522 structlog.stdlib.add_log_level, 

523 structlog.processors.TimeStamper(fmt="iso"), 

524 structlog.processors.StackInfoRenderer(), 

525 structlog.processors.UnicodeDecoder(), 

526 structlog.processors.CallsiteParameterAdder( 

527 [ 

528 structlog.processors.CallsiteParameter.FILENAME, 

529 structlog.processors.CallsiteParameter.FUNC_NAME, 

530 structlog.processors.CallsiteParameter.LINENO, 

531 ] 

532 ), 

533 ] 

534 

535 if format == "json": 

536 processors.append(structlog.processors.JSONRenderer()) 

537 else: 

538 processors.append(structlog.dev.ConsoleRenderer(colors=True)) 

539 

540 # Configure structlog 

541 structlog.configure( 

542 processors=processors, 

543 wrapper_class=structlog.make_filtering_bound_logger(numeric_level), 

544 logger_factory=structlog.stdlib.LoggerFactory(), 

545 cache_logger_on_first_use=False, # Disable caching to ensure new configuration is used 

546 ) 

547 

548 cls._initialized = True 

549 cls._current_config = ( 

550 level, 

551 format, 

552 file, 

553 suppress_qdrant_warnings, 

554 clean_output, 

555 ) 

556 

557 @classmethod 

558 def get_logger(cls, name: str | None = None) -> structlog.BoundLogger: 

559 """Get a logger instance. 

560 

561 Args: 

562 name: Logger name. If None, will use the calling module's name. 

563 

564 Returns: 

565 structlog.BoundLogger: Logger instance 

566 """ 

567 if not cls._initialized: 

568 # Initialize with default settings if not already initialized 

569 cls.setup() 

570 return structlog.get_logger(name)