Coverage for src/qdrant_loader/utils/logging.py: 73%
193 statements
« prev ^ index » next coverage.py v7.10.0, created at 2025-07-25 11:39 +0000
« prev ^ index » next coverage.py v7.10.0, created at 2025-07-25 11:39 +0000
1"""Centralized logging configuration for the application."""
3import logging
4import re
6import structlog
9class QdrantVersionFilter(logging.Filter):
10 """Filter to suppress Qdrant version check warnings."""
12 def filter(self, record):
13 return "version check" not in record.getMessage().lower()
16class ApplicationFilter(logging.Filter):
17 """Filter to only show logs from our application."""
19 def filter(self, record):
20 # Only show logs from our application
21 return not record.name.startswith(("httpx", "httpcore", "urllib3"))
24class SQLiteFilter(logging.Filter):
25 """Filter to suppress verbose SQLite operation logs."""
27 def filter(self, record):
28 # Suppress verbose SQLite debug logs
29 message = record.getMessage()
30 sqlite_patterns = [
31 "executing functools.partial(<built-in method",
32 "operation functools.partial(<built-in method",
33 "executing <function connect.",
34 "operation <function connect.",
35 "returning exception",
36 ]
37 return not any(pattern in message for pattern in sqlite_patterns)
40class VerbosityFilter(logging.Filter):
41 """Filter to reduce verbosity of debug messages."""
43 def filter(self, record):
44 # Only filter DEBUG level messages
45 if record.levelno != logging.DEBUG:
46 return True
48 # Suppress overly verbose debug messages
49 message = record.getMessage()
50 verbose_patterns = [
51 "HTTP Request:",
52 "Response status:",
53 "Request headers:",
54 "Response headers:",
55 # PDF parsing debug messages
56 "seek:",
57 "nexttoken:",
58 "do_keyword:",
59 "nextobject:",
60 "add_results:",
61 "register:",
62 "getobj:",
63 "get_unichr:",
64 "exec:",
65 # Character encoding detection
66 "confidence =",
67 "prober hit error",
68 "not active",
69 # File processing verbosity
70 "checking if file should be processed",
71 "current configuration",
72 "checking file extension",
73 "file type detection",
74 "file supported via",
75 "starting metadata extraction",
76 "completed metadata extraction",
77 "document metadata:",
78 # HTTP client debug
79 "connect_tcp.started",
80 "connect_tcp.complete",
81 "send_request_headers",
82 "send_request_body",
83 "receive_response_headers",
84 "receive_response_body",
85 "response_closed",
86 ]
87 return not any(pattern in message for pattern in verbose_patterns)
90class WindowsSafeConsoleHandler(logging.StreamHandler):
91 """Custom console handler that handles Windows encoding issues."""
93 # ANSI escape sequence pattern
94 ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
96 def emit(self, record):
97 """Emit a record, handling Windows console encoding issues."""
98 try:
99 # Get the formatted message
100 msg = self.format(record)
102 # For Windows, handle Unicode characters proactively
103 import sys
105 if sys.platform == "win32":
106 # Replace problematic Unicode characters with safe alternatives
107 safe_msg = (
108 msg.replace("🚀", "[ROCKET]")
109 .replace("📄", "[DOCUMENT]")
110 .replace("✅", "[CHECK]")
111 .replace("❌", "[CROSS]")
112 .replace("⚠️", "[WARNING]")
113 .replace("🔍", "[SEARCH]")
114 .replace("💾", "[SAVE]")
115 .replace("🔧", "[TOOL]")
116 .replace("📊", "[CHART]")
117 .replace("🎯", "[TARGET]")
118 .replace("⚙️", "[GEAR]")
119 .replace("⏱️", "[TIMER]")
120 .replace("⏭️", "[NEXT]")
121 .replace("🏗️", "[CONSTRUCTION]")
122 .replace("1️⃣", "[1]")
123 .replace("2️⃣", "[2]")
124 .replace("3️⃣", "[3]")
125 .replace("4️⃣", "[4]")
126 )
127 try:
128 self.stream.write(safe_msg + self.terminator)
129 except UnicodeEncodeError:
130 # Final fallback: encode with ASCII and replace all unsupported chars
131 ascii_msg = safe_msg.encode("ascii", errors="replace").decode(
132 "ascii"
133 )
134 self.stream.write(ascii_msg + self.terminator)
135 else:
136 # For non-Windows, use standard handling
137 self.stream.write(msg + self.terminator)
139 self.flush()
140 except Exception:
141 self.handleError(record)
144class CleanFileHandler(logging.FileHandler):
145 """Custom file handler that strips ANSI color codes from log messages."""
147 # ANSI escape sequence pattern
148 ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
150 def emit(self, record):
151 """Emit a record, stripping ANSI codes from the message."""
152 try:
153 # Get the formatted message
154 msg = self.format(record)
155 # Strip ANSI escape sequences
156 clean_msg = self.ANSI_ESCAPE.sub("", msg)
158 # Handle Windows console encoding issues more robustly
159 stream = self.stream
160 try:
161 stream.write(clean_msg + self.terminator)
162 except UnicodeEncodeError:
163 # Fallback for Windows console encoding issues
164 # Replace problematic Unicode characters with safe alternatives
165 import sys
167 if sys.platform == "win32":
168 # Replace common emoji/Unicode characters with ASCII equivalents
169 safe_msg = (
170 clean_msg.replace("🚀", "[ROCKET]")
171 .replace("📄", "[DOCUMENT]")
172 .replace("✅", "[CHECK]")
173 .replace("❌", "[CROSS]")
174 .replace("⚠️", "[WARNING]")
175 .replace("🔍", "[SEARCH]")
176 .replace("💾", "[SAVE]")
177 .replace("🔧", "[TOOL]")
178 .replace("📊", "[CHART]")
179 .replace("🎯", "[TARGET]")
180 .replace("⚙️", "[GEAR]")
181 .replace("⏱️", "[TIMER]")
182 .replace("⏭️", "[NEXT]")
183 .replace("🏗️", "[CONSTRUCTION]")
184 .replace("1️⃣", "[1]")
185 .replace("2️⃣", "[2]")
186 .replace("3️⃣", "[3]")
187 .replace("4️⃣", "[4]")
188 )
189 try:
190 stream.write(safe_msg + self.terminator)
191 except UnicodeEncodeError:
192 # Final fallback: encode with ASCII and replace all unsupported chars
193 ascii_msg = safe_msg.encode("ascii", errors="replace").decode(
194 "ascii"
195 )
196 stream.write(ascii_msg + self.terminator)
197 else:
198 # For non-Windows, try UTF-8 encoding with replacement
199 safe_msg = clean_msg.encode("utf-8", errors="replace").decode(
200 "utf-8", errors="replace"
201 )
202 stream.write(safe_msg + self.terminator)
203 self.flush()
204 except Exception:
205 self.handleError(record)
208class CleanFormatter(logging.Formatter):
209 """Custom formatter that shows only the message for INFO level logs."""
211 def __init__(self, use_custom_renderer=False):
212 super().__init__()
213 self.use_custom_renderer = use_custom_renderer
215 def format(self, record):
216 message = record.getMessage()
218 # If we're using the custom renderer, just return the message as-is
219 # since the CustomConsoleRenderer already handled the formatting
220 if self.use_custom_renderer:
221 return message
223 # For INFO level, just show the message
224 if record.levelno == logging.INFO:
225 return message
226 else:
227 # For other levels, we need to reorder timestamp and level
228 # Check if message starts with a timestamp (HH:MM:SS format)
229 # The message might contain ANSI color codes, so we need to account for that
230 time_pattern = (
231 r"^(?:\x1b\[[0-9;]*m)?(\d{2}:\d{2}:\d{2})(?:\x1b\[[0-9;]*m)?\s+(.*)"
232 )
233 match = re.match(time_pattern, message)
235 if match:
236 timestamp = match.group(1)
237 rest_of_message = match.group(2)
239 # Check if the rest_of_message already contains a level tag
240 # This can happen when CustomConsoleRenderer already formatted it
241 level_in_message_pattern = (
242 r"^\[(?:DEBUG|INFO|WARNING|ERROR|CRITICAL)\]\s"
243 )
244 if re.match(level_in_message_pattern, rest_of_message):
245 # Level already present, don't add another one
246 return f"{timestamp} {rest_of_message}"
247 else:
248 # Add level tag
249 return f"{timestamp} [{record.levelname}] {rest_of_message}"
250 else:
251 # No timestamp found
252 # Check if message already has a level tag at the beginning
253 level_in_message_pattern = (
254 r"^\[(?:DEBUG|INFO|WARNING|ERROR|CRITICAL)\]\s"
255 )
256 if re.match(level_in_message_pattern, message):
257 # Level already present, don't add another one
258 return message
259 else:
260 # Add level tag
261 return f"[{record.levelname}] {message}"
264class FileRenderer:
265 """Custom renderer for file output without timestamps (FileFormatter will add them)."""
267 def __call__(self, logger, method_name, event_dict):
268 # Extract the main message
269 event = event_dict.pop("event", "")
271 # Format additional key-value pairs
272 if event_dict:
273 extras = " ".join(f"{k}={v}" for k, v in event_dict.items())
274 return f"{event} {extras}".strip()
275 else:
276 return event
279class FileFormatter(logging.Formatter):
280 """Custom formatter for file output that provides clean, readable logs without ANSI codes."""
282 # ANSI escape sequence pattern
283 ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
285 def format(self, record):
286 # Get the timestamp
287 timestamp = self.formatTime(record, "%Y-%m-%d %H:%M:%S")
289 # Get the level name
290 level = record.levelname
292 # Get the message (this will be the already formatted structlog message)
293 message = record.getMessage()
295 # First, strip ANSI escape sequences
296 clean_message = self.ANSI_ESCAPE.sub("", message)
298 # Now check if the clean message starts with a timestamp and remove it
299 # Pattern for structlog output: "HH:MM:SS message content"
300 time_pattern = r"^\d{2}:\d{2}:\d{2}\s+"
301 if re.match(time_pattern, clean_message):
302 # Remove the structlog timestamp since we're adding our own
303 clean_message = re.sub(time_pattern, "", clean_message)
305 # Check if the message already contains a level tag to avoid duplication
306 level_in_message_pattern = r"^\[(?:DEBUG|INFO|WARNING|ERROR|CRITICAL)\]\s"
307 has_level_tag = re.match(level_in_message_pattern, clean_message)
309 # Format based on log level
310 if record.levelno == logging.INFO:
311 # For INFO level, use a clean format: timestamp | message
312 return f"{timestamp} | {clean_message}"
313 else:
314 # For other levels, include the level only if not already present
315 if has_level_tag:
316 return f"{timestamp} | {clean_message}"
317 else:
318 return f"{timestamp} | [{level}] {clean_message}"
321class CustomConsoleRenderer:
322 """Custom console renderer that formats timestamp and level correctly."""
324 def __init__(self, colors=True):
325 self.colors = colors
326 self._console_renderer = structlog.dev.ConsoleRenderer(colors=colors)
327 # ANSI color codes
328 self.gray = "\033[90m" if colors else ""
329 self.green = "\033[92m" if colors else "" # Bright green for INFO
330 self.yellow = "\033[93m" if colors else "" # Bright yellow for WARNING
331 self.red = "\033[91m" if colors else "" # Bright red for ERROR
332 self.magenta = "\033[95m" if colors else "" # Bright magenta for CRITICAL
333 self.cyan = "\033[96m" if colors else "" # Bright cyan for DEBUG
334 self.reset = "\033[0m" if colors else ""
336 def _get_level_color(self, level):
337 """Get the appropriate color for a log level."""
338 level_colors = {
339 "DEBUG": self.cyan,
340 "INFO": self.green, # Green for INFO
341 "WARNING": self.yellow,
342 "ERROR": self.red,
343 "CRITICAL": self.magenta,
344 }
345 return level_colors.get(level, "")
347 def __call__(self, logger, method_name, event_dict):
348 # Extract timestamp if present
349 timestamp = event_dict.pop("timestamp", None)
351 # Get the level from method_name
352 level = method_name.upper()
354 # Use the default console renderer to format the rest
355 formatted = self._console_renderer(logger, method_name, event_dict)
357 # If we have a timestamp
358 if timestamp and isinstance(timestamp, str) and len(timestamp) >= 8:
359 time_part = timestamp[:8] # Get HH:MM:SS part
361 # Remove the timestamp from the formatted message if it's there
362 if formatted.startswith(time_part):
363 formatted = formatted[len(time_part) :].lstrip()
365 # Add gray color to timestamp
366 colored_timestamp = f"{self.gray}{time_part}{self.reset}"
368 # Get colored level for all levels including INFO
369 level_color = self._get_level_color(level)
370 colored_level = (
371 f"{level_color}[{level}]{self.reset}" if level_color else f"[{level}]"
372 )
374 # Show timestamp, colored level, and message for all levels
375 return f"{colored_timestamp} {colored_level} {formatted}"
377 # Fallback if no timestamp
378 level_color = self._get_level_color(level)
379 colored_level = (
380 f"{level_color}[{level}]{self.reset}" if level_color else f"[{level}]"
381 )
382 return f"{colored_level} {formatted}"
385class LoggingConfig:
386 """Centralized logging configuration."""
388 _initialized = False
389 _current_config = None
391 @classmethod
392 def setup(
393 cls,
394 level: str = "INFO",
395 format: str = "console",
396 file: str | None = None,
397 suppress_qdrant_warnings: bool = True,
398 clean_output: bool = True,
399 ) -> None:
400 """Setup logging configuration.
402 Args:
403 level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
404 format: Log format (json or console)
405 file: Path to log file (optional)
406 suppress_qdrant_warnings: Whether to suppress Qdrant version check warnings
407 clean_output: Whether to use clean, less verbose output
408 """
409 try:
410 # Convert string level to logging level
411 numeric_level = getattr(logging, level.upper())
412 except AttributeError:
413 raise ValueError(f"Invalid log level: {level}") from None
415 # Reset logging configuration
416 logging.getLogger().handlers = []
417 structlog.reset_defaults()
419 # Create a list of handlers
420 handlers = []
422 # Add console handler with Windows encoding support
424 # Use Windows-safe console handler for all platforms
425 console_handler = WindowsSafeConsoleHandler()
427 if clean_output and format == "console":
428 # Use clean formatter for console output
429 console_handler.setFormatter(CleanFormatter(use_custom_renderer=True))
430 else:
431 console_handler.setFormatter(logging.Formatter("%(message)s"))
433 console_handler.addFilter(ApplicationFilter()) # Only show our application logs
434 console_handler.addFilter(SQLiteFilter()) # Suppress verbose SQLite logs
436 if clean_output:
437 console_handler.addFilter(VerbosityFilter()) # Reduce verbosity
439 handlers.append(console_handler)
441 # Add file handler if file is configured
442 if file:
443 file_handler = CleanFileHandler(file)
444 file_handler.setFormatter(FileFormatter())
445 file_handler.addFilter(
446 SQLiteFilter()
447 ) # Suppress verbose SQLite logs in files too
448 # Don't apply verbosity filter to file logs - keep everything for debugging
449 handlers.append(file_handler)
451 # Configure standard logging
452 logging.basicConfig(
453 level=numeric_level,
454 format="%(message)s",
455 handlers=handlers,
456 )
458 # Apply SQLite filter to all existing loggers
459 root_logger = logging.getLogger()
460 for handler in root_logger.handlers:
461 handler.addFilter(SQLiteFilter())
463 # Add filter to suppress Qdrant version check warnings
464 if suppress_qdrant_warnings:
465 qdrant_logger = logging.getLogger("qdrant_client")
466 qdrant_logger.addFilter(QdrantVersionFilter())
468 # Suppress verbose SQLAlchemy and SQLite logs
469 sqlalchemy_loggers = [
470 "sqlalchemy.engine",
471 "sqlalchemy.dialects",
472 "sqlalchemy.pool",
473 "aiosqlite",
474 "sqlite3",
475 ]
477 for logger_name in sqlalchemy_loggers:
478 logger = logging.getLogger(logger_name)
479 logger.addFilter(SQLiteFilter())
480 logger.setLevel(logging.WARNING) # Only show warnings and errors
482 # Suppress verbose third-party library debug logs
483 noisy_loggers = [
484 "chardet", # Character encoding detection
485 "chardet.charsetprober",
486 "chardet.latin1prober",
487 "chardet.mbcharsetprober",
488 "chardet.sbcharsetprober",
489 "chardet.utf8prober",
490 "pdfminer", # PDF parsing
491 "pdfminer.pdfparser",
492 "pdfminer.pdfdocument",
493 "pdfminer.pdfinterp",
494 "pdfminer.converter",
495 "pdfplumber", # PDF processing
496 "markitdown", # File conversion
497 "httpcore", # HTTP client debug logs
498 "httpx", # HTTP client debug logs
499 "gensim", # Topic modeling library
500 "gensim.models", # LDA model training logs
501 "gensim.models.ldamodel", # Specific LDA training logs
502 "gensim.corpora", # Dictionary and corpus logs
503 ]
505 for logger_name in noisy_loggers:
506 logger = logging.getLogger(logger_name)
507 logger.setLevel(logging.WARNING) # Only show warnings and errors
509 # Configure structlog processors based on format and clean_output
510 if clean_output and format == "console":
511 # Minimal processors for clean output
512 processors = [
513 structlog.stdlib.filter_by_level,
514 structlog.processors.TimeStamper(fmt="%H:%M:%S"),
515 CustomConsoleRenderer(colors=True),
516 ]
517 else:
518 # Full processors for detailed output
519 processors = [
520 structlog.stdlib.filter_by_level,
521 structlog.stdlib.add_logger_name,
522 structlog.stdlib.add_log_level,
523 structlog.processors.TimeStamper(fmt="iso"),
524 structlog.processors.StackInfoRenderer(),
525 structlog.processors.UnicodeDecoder(),
526 structlog.processors.CallsiteParameterAdder(
527 [
528 structlog.processors.CallsiteParameter.FILENAME,
529 structlog.processors.CallsiteParameter.FUNC_NAME,
530 structlog.processors.CallsiteParameter.LINENO,
531 ]
532 ),
533 ]
535 if format == "json":
536 processors.append(structlog.processors.JSONRenderer())
537 else:
538 processors.append(structlog.dev.ConsoleRenderer(colors=True))
540 # Configure structlog
541 structlog.configure(
542 processors=processors,
543 wrapper_class=structlog.make_filtering_bound_logger(numeric_level),
544 logger_factory=structlog.stdlib.LoggerFactory(),
545 cache_logger_on_first_use=False, # Disable caching to ensure new configuration is used
546 )
548 cls._initialized = True
549 cls._current_config = (
550 level,
551 format,
552 file,
553 suppress_qdrant_warnings,
554 clean_output,
555 )
557 @classmethod
558 def get_logger(cls, name: str | None = None) -> structlog.BoundLogger:
559 """Get a logger instance.
561 Args:
562 name: Logger name. If None, will use the calling module's name.
564 Returns:
565 structlog.BoundLogger: Logger instance
566 """
567 if not cls._initialized:
568 # Initialize with default settings if not already initialized
569 cls.setup()
570 return structlog.get_logger(name)