Coverage for src/qdrant_loader/connectors/publicdocs/config.py: 100%

1"""Configuration for Public Documentation connector."""

3from pydantic import BaseModel, Field, field_validator

5from qdrant_loader.config.source_config import SourceConfig

class SelectorsConfig(BaseModel):
    """CSS selectors used when extracting content from HTML pages.

    Every field has a default, so the model can be built with no arguments.
    """

    # Selector for the container that holds the main page content.
    content: str = Field(
        default="article, main, .content",
        description="Main content container selector",
    )

    # Selectors for elements that are removed from the content.
    remove: list[str] = Field(
        default=["nav", "header", "footer", ".sidebar"],
        description="Elements to remove from the content",
    )

    # Selector for code blocks.
    code_blocks: str = Field(
        default="pre code",
        description="Code blocks selector",
    )

class PublicDocsSourceConfig(SourceConfig):
    """Settings describing one public documentation source.

    Extends ``SourceConfig`` with the documentation version, content type,
    path filtering, extraction selectors, attachment handling and a
    request rate limit. ``version`` is the only field declared here that
    has no default.
    """

    # --- What to fetch -----------------------------------------------------
    version: str = Field(
        ...,
        description="Specific version of the documentation to fetch",
    )
    content_type: str = Field(
        default="html",
        description="Content type of the documentation",
    )
    path_pattern: str | None = Field(
        default=None,
        description="Specific path pattern to match documentation pages",
    )
    exclude_paths: list[str] = Field(
        default=[],
        description="List of paths to exclude from processing",
    )

    # --- How to extract content --------------------------------------------
    selectors: SelectorsConfig = Field(
        default_factory=SelectorsConfig,
        description="CSS selectors for content extraction",
    )

    # --- Attachment handling -----------------------------------------------
    download_attachments: bool = Field(
        default=False,
        description="Whether to download and process linked files (PDFs, docs, etc.)",
    )
    attachment_selectors: list[str] = Field(
        default=[
            "a[href$='.pdf']",
            "a[href$='.doc']",
            "a[href$='.docx']",
            "a[href$='.xls']",
            "a[href$='.xlsx']",
            "a[href$='.ppt']",
            "a[href$='.pptx']",
        ],
        description="CSS selectors for finding downloadable attachments",
    )

    # --- Rate limiting -----------------------------------------------------
    # Bounded to the inclusive range 1..2000 by the ge/le constraints.
    requests_per_minute: int = Field(
        default=120,
        description="Maximum number of requests per minute for crawling",
        ge=1,
        le=2000,
    )

    @field_validator("content_type")
    @classmethod
    def validate_content_type(cls, v: str) -> str:
        """Lowercase the content type and reject unsupported values.

        Args:
            v: The content type as supplied; matching is case-insensitive.

        Returns:
            The lowercased content type.

        Raises:
            ValueError: If the lowercased value is not "html", "markdown"
                or "rst".
        """
        allowed = ["html", "markdown", "rst"]
        normalized = v.lower()
        if normalized in allowed:
            return normalized
        raise ValueError(f"Content type must be one of {allowed}")

Coverage for src/qdrant_loader/connectors/publicdocs/config.py: 100%

22 statements