Coverage for src/qdrant_loader/connectors/publicdocs/config.py: 95%
21 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-04 05:50 +0000
1"""Configuration for Public Documentation connector."""
3from pydantic import BaseModel, Field, field_validator
5from qdrant_loader.config.source_config import SourceConfig
class SelectorsConfig(BaseModel):
    """CSS selectors used when extracting content from HTML pages.

    Each field has a default, so the model can be built with no arguments.
    """

    # Selector for the container holding the main page content.
    content: str = Field(
        "article, main, .content",
        description="Main content container selector",
    )

    # Selectors for elements to remove from the content.
    remove: list[str] = Field(
        ["nav", "header", "footer", ".sidebar"],
        description="Elements to remove from the content",
    )

    # Selector for code blocks.
    code_blocks: str = Field(
        "pre code",
        description="Code blocks selector",
    )
class PublicDocsSourceConfig(SourceConfig):
    """Settings for one public documentation source.

    Adds the documentation-specific options below to the fields inherited
    from ``SourceConfig``. Only ``version`` is required here; every other
    field declared in this class has a default.
    """

    # Required: no default is provided.
    version: str = Field(
        default=...,
        description="Specific version of the documentation to fetch",
    )

    # Lower-cased and restricted to a fixed set by ``validate_content_type``.
    content_type: str = Field(
        "html",
        description="Content type of the documentation",
    )

    path_pattern: str | None = Field(
        None,
        description="Specific path pattern to match documentation pages",
    )

    exclude_paths: list[str] = Field(
        [],
        description="List of paths to exclude from processing",
    )

    selectors: SelectorsConfig = Field(
        default_factory=SelectorsConfig,
        description="CSS selectors for content extraction",
    )

    # Attachment handling
    download_attachments: bool = Field(
        False,
        description="Whether to download and process linked files (PDFs, docs, etc.)",
    )

    attachment_selectors: list[str] = Field(
        [
            "a[href$='.pdf']",
            "a[href$='.doc']",
            "a[href$='.docx']",
            "a[href$='.xls']",
            "a[href$='.xlsx']",
            "a[href$='.ppt']",
            "a[href$='.pptx']",
        ],
        description="CSS selectors for finding downloadable attachments",
    )

    @field_validator("content_type")
    @classmethod
    def validate_content_type(cls, v: str) -> str:
        """Return ``v`` lower-cased, rejecting unsupported content types.

        Args:
            v: The content type value to check.

        Returns:
            The lower-cased content type.

        Raises:
            ValueError: If the lower-cased value is not ``"html"``,
                ``"markdown"`` or ``"rst"``.
        """
        # Kept as a list: its repr is embedded in the error message below.
        valid_types = ["html", "markdown", "rst"]
        normalized = v.lower()
        if normalized in valid_types:
            return normalized
        raise ValueError(f"Content type must be one of {valid_types}")