Coverage for src/qdrant_loader/connectors/publicdocs/config.py: 95%

21 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-04 05:50 +0000

1"""Configuration for Public Documentation connector.""" 

2 

3from pydantic import BaseModel, Field, field_validator 

4 

5from qdrant_loader.config.source_config import SourceConfig 

6 

7 

class SelectorsConfig(BaseModel):
    """Configuration for HTML content extraction selectors."""

    # Selector for the element(s) that hold the main page content; the
    # default is a comma-separated selector list of alternatives.
    content: str = Field(
        description="Main content container selector",
        default="article, main, .content",
    )

    # Selectors for elements that are removed from the content.
    remove: list[str] = Field(
        description="Elements to remove from the content",
        default=["nav", "header", "footer", ".sidebar"],
    )

    # Selector used to locate code blocks.
    code_blocks: str = Field(
        description="Code blocks selector",
        default="pre code",
    )

19 

20 

class PublicDocsSourceConfig(SourceConfig):
    """Configuration for a single public documentation source."""

    # Required field: the Ellipsis default means no fallback value exists.
    version: str = Field(
        ...,
        description="Specific version of the documentation to fetch",
    )

    # Checked and lowercased by ``validate_content_type`` below.
    content_type: str = Field(
        description="Content type of the documentation",
        default="html",
    )

    # Optional; None when no path pattern is configured.
    path_pattern: str | None = Field(
        description="Specific path pattern to match documentation pages",
        default=None,
    )

    exclude_paths: list[str] = Field(
        description="List of paths to exclude from processing",
        default=[],
    )

    # A default SelectorsConfig instance is built when none is supplied.
    selectors: SelectorsConfig = Field(
        description="CSS selectors for content extraction",
        default_factory=SelectorsConfig,
    )

    # Attachment handling
    download_attachments: bool = Field(
        description="Whether to download and process linked files (PDFs, docs, etc.)",
        default=False,
    )

    # Each default selector matches anchors whose href ends with one
    # document, spreadsheet, or presentation file extension.
    attachment_selectors: list[str] = Field(
        description="CSS selectors for finding downloadable attachments",
        default=[
            "a[href$='.pdf']",
            "a[href$='.doc']",
            "a[href$='.docx']",
            "a[href$='.xls']",
            "a[href$='.xlsx']",
            "a[href$='.ppt']",
            "a[href$='.pptx']",
        ],
    )

    @field_validator("content_type")
    @classmethod
    def validate_content_type(cls, v: str) -> str:
        """Check the content type against the supported set and lowercase it.

        Args:
            v: Content type value to validate; compared case-insensitively.

        Returns:
            The lowercased content type.

        Raises:
            ValueError: If the lowercased value is not "html", "markdown",
                or "rst".
        """
        valid_types = ["html", "markdown", "rst"]
        normalized = v.lower()
        if normalized in valid_types:
            return normalized
        raise ValueError(f"Content type must be one of {valid_types}")
66 return v.lower()