Coverage for src/qdrant_loader/config/global_config.py: 100%

28 statements  

« prev     ^ index     » next       coverage.py v7.10.0, created at 2025-07-25 11:39 +0000

1"""Global configuration settings. 

2 

3This module defines the global configuration settings that apply across the application, 

4including chunking, embedding, and logging configurations. 

5""" 

6 

7from typing import Any 

8 

9from pydantic import Field 

10 

11from qdrant_loader.config.base import BaseConfig 

12from qdrant_loader.config.chunking import ChunkingConfig 

13from qdrant_loader.config.embedding import EmbeddingConfig 

14from qdrant_loader.config.qdrant import QdrantConfig 

15from qdrant_loader.config.sources import SourcesConfig 

16from qdrant_loader.config.state import StateManagementConfig 

17from qdrant_loader.core.file_conversion import FileConversionConfig 

18 

19 

class SemanticAnalysisConfig(BaseConfig):
    """Configuration for semantic analysis."""

    # How many topics the LDA step extracts (defaults to 3).
    num_topics: int = Field(
        description="Number of topics to extract using LDA",
        default=3,
    )

    # How many training passes LDA performs (defaults to 10).
    lda_passes: int = Field(
        description="Number of passes for LDA training",
        default=10,
    )

    # Name of the spaCy model; the description enumerates the size/vector
    # trade-offs of the small, medium and large English models.
    spacy_model: str = Field(
        description=(
            "spaCy model to use for text processing. "
            "Options: en_core_web_sm (15MB, no vectors), "
            "en_core_web_md (50MB, 20k vectors), "
            "en_core_web_lg (750MB, 514k vectors)"
        ),
        default="en_core_web_md",
    )

33 

34 

class GlobalConfig(BaseConfig):
    """Global configuration settings."""

    # Each section below falls back to its config class' own defaults when
    # the caller does not supply a value.
    chunking: ChunkingConfig = Field(default_factory=ChunkingConfig)
    embedding: EmbeddingConfig = Field(default_factory=EmbeddingConfig)
    semantic_analysis: SemanticAnalysisConfig = Field(
        description="Semantic analysis configuration",
        default_factory=SemanticAnalysisConfig,
    )
    # Without an explicit value, state management is built with
    # database_path=":memory:".
    state_management: StateManagementConfig = Field(
        description="State management configuration",
        default_factory=lambda: StateManagementConfig(database_path=":memory:"),
    )
    sources: SourcesConfig = Field(default_factory=SourcesConfig)
    file_conversion: FileConversionConfig = Field(
        description="File conversion configuration",
        default_factory=FileConversionConfig,
    )
    # Optional section: stays None unless Qdrant settings are provided.
    qdrant: QdrantConfig | None = Field(default=None, description="Qdrant configuration")

    def __init__(self, **data):
        """Build the global configuration from keyword data.

        The ``skip_validation`` key is consumed here and never forwarded to
        the base initializer. When it is truthy and the caller supplied no
        ``state_management`` entry, an in-memory state configuration (with an
        explicit table prefix and connection pool) is injected before the
        remaining data is handed to the parent class.
        """
        skip_validation = data.pop("skip_validation", False)
        if skip_validation:
            # setdefault leaves a caller-provided state_management untouched.
            data.setdefault(
                "state_management",
                {
                    "database_path": ":memory:",
                    "table_prefix": "qdrant_loader_",
                    "connection_pool": {"size": 5, "timeout": 30},
                },
            )
        super().__init__(**data)

    def to_dict(self) -> dict[str, Any]:
        """Return the configuration as a nested plain dictionary.

        Sections are assembled one at a time, in the same order as the keys
        of the returned mapping. ``qdrant`` maps to ``None`` when no Qdrant
        configuration is set.
        """
        chunking_section = {
            "chunk_size": self.chunking.chunk_size,
            "chunk_overlap": self.chunking.chunk_overlap,
        }
        embedding_section = self.embedding.model_dump()

        semantic = self.semantic_analysis
        semantic_section = {
            "num_topics": semantic.num_topics,
            "lda_passes": semantic.lda_passes,
            "spacy_model": semantic.spacy_model,
        }

        sources_section = self.sources.to_dict()
        state_section = self.state_management.to_dict()

        conversion = self.file_conversion
        markitdown = conversion.markitdown
        conversion_section = {
            "max_file_size": conversion.max_file_size,
            "conversion_timeout": conversion.conversion_timeout,
            "markitdown": {
                "enable_llm_descriptions": markitdown.enable_llm_descriptions,
                "llm_model": markitdown.llm_model,
                "llm_endpoint": markitdown.llm_endpoint,
                "llm_api_key": markitdown.llm_api_key,
            },
        }

        qdrant = self.qdrant
        if qdrant:
            qdrant_section = qdrant.to_dict()
        else:
            qdrant_section = None

        return {
            "chunking": chunking_section,
            "embedding": embedding_section,
            "semantic_analysis": semantic_section,
            "sources": sources_section,
            "state_management": state_section,
            "file_conversion": conversion_section,
            "qdrant": qdrant_section,
        }