Coverage for src/qdrant_loader/core/worker/scheduler.py: 91%

1from __future__ import annotations

3import asyncio

4import json

5import time

6from collections.abc import Callable

8from qdrant_loader.config.models import ProjectsConfig

9from qdrant_loader.config.workers import IncrementalPullScheduleConfig

10from qdrant_loader.core.worker.job_types import JobType

11from qdrant_loader.core.worker.queue import JobQueue

12from qdrant_loader.utils.logging import LoggingConfig

14logger = LoggingConfig.get_logger(__name__)

class IncrementalPullScheduler:
    """Create periodic INCREMENTAL_PULL jobs using a monotonic clock.

    On every scheduling pass one job is enqueued per configured
    (project, source type, source name) triple, unless the queue already
    holds a job for the same triple in one of ``schedule.dedup_statuses``.
    """

    # Attribute names looked up on ``project.sources``. Each one is iterated
    # for its keys (the source names); a missing or falsy attribute is skipped.
    SOURCE_TYPES = ("publicdocs", "git", "confluence", "jira", "localfile")

    # Number of jobs requested per ``queue.list`` call while de-duplicating.
    _DEDUP_PAGE_SIZE = 1000

    def __init__(
        self,
        queue: JobQueue,
        projects_config: ProjectsConfig,
        schedule: IncrementalPullScheduleConfig,
        monotonic: Callable[[], float] | None = None,
    ) -> None:
        """Store collaborators.

        Args:
            queue: Job queue used to list existing jobs and enqueue new ones.
            projects_config: Source of the projects/sources to schedule.
            schedule: Scheduling options (``enabled``, ``interval_seconds``,
                ``payload_defaults``, ``dedup_statuses``).
            monotonic: Clock returning seconds as a float; defaults to
                ``time.monotonic``. Injectable so tests can control time.
        """
        self._queue = queue
        self._projects_config = projects_config
        self._schedule = schedule
        self._monotonic = monotonic or time.monotonic

    async def run(self, stop_event: asyncio.Event) -> None:
        """Run the periodic scheduling loop until ``stop_event`` is set.

        Returns immediately when the schedule is disabled. Exceptions raised
        by ``run_once`` are not caught here and therefore end the loop.

        Raises:
            ValueError: If ``interval_seconds`` is not a positive number.
        """
        if not self._schedule.enabled:
            logger.info("scheduler.incremental_pull.disabled")
            return

        interval = float(self._schedule.interval_seconds)
        # A zero/negative interval would make the catch-up loop below spin
        # forever (and NaN would silently disable scheduling), so fail fast.
        if not interval > 0:
            raise ValueError(
                "interval_seconds must be a positive number, "
                f"got {self._schedule.interval_seconds!r}"
            )

        # First scheduling pass should happen immediately on service startup.
        # Subsequent passes run at the configured interval.
        next_run_at = self._monotonic()

        logger.info(
            "scheduler.incremental_pull.started",
            interval_seconds=self._schedule.interval_seconds,
        )

        while not stop_event.is_set():
            now = self._monotonic()
            if now >= next_run_at:
                created = await self.run_once()
                logger.info("scheduler.incremental_pull.tick", created=created)

                # Skip any slots that were missed while the pass was running
                # so that late ticks do not pile up.
                while next_run_at <= now:
                    next_run_at += interval

            # Sleep in slices of at most one second; the wait ends early as
            # soon as stop_event is set.
            timeout = max(0.0, min(1.0, next_run_at - self._monotonic()))
            try:
                await asyncio.wait_for(stop_event.wait(), timeout=timeout)
            except TimeoutError:
                # Timing out only means it is time to re-check the clock.
                # NOTE(review): on Python < 3.11 asyncio.wait_for raises
                # asyncio.TimeoutError, which is not the builtin -- confirm
                # the minimum supported Python version.
                pass

        logger.info("scheduler.incremental_pull.stopped")

    async def run_once(self) -> int:
        """Attempt to enqueue all due INCREMENTAL_PULL jobs once.

        Returns:
            Number of jobs enqueued by this pass (0 when disabled).
        """
        if not self._schedule.enabled:
            return 0

        job_type = JobType.INCREMENTAL_PULL
        dedup_keys = await self._load_active_dedup_keys()
        created = 0

        for project_id, source_type, source_name in self._iter_project_sources():
            # Normalise exactly like _job_dedup_key() does for queued jobs so
            # both sides of the membership test agree on whitespace.
            identity = self._normalize_identity(project_id, source_type, source_name)
            if identity is None:
                # Unusual (empty / non-string) names: keep the raw values so
                # the source is still scheduled.
                identity = (project_id, source_type, source_name)
            dedup_key = (job_type.value, *identity)
            if dedup_key in dedup_keys:
                continue

            # Scheduler-owned fields always win over configured defaults.
            payload = dict(self._schedule.payload_defaults or {})
            payload.update(
                {
                    "project_id": project_id,
                    "source_type": source_type,
                    "source": source_name,
                    "source_lock": f"{project_id}:{source_type}:{source_name}",
                    "force": False,
                }
            )

            await self._queue.enqueue(job_type, payload)
            # Guard against the same triple appearing twice in this pass.
            dedup_keys.add(dedup_key)
            created += 1

        return created

    def _iter_project_sources(self):
        """Yield ``(project_id, source_type, source_name)`` for every source.

        Projects are visited in ``projects_config.projects`` order and source
        types in ``SOURCE_TYPES`` order.
        """
        for project in self._projects_config.projects.values():
            for source_type in self.SOURCE_TYPES:
                source_map = getattr(project.sources, source_type, {}) or {}
                for source_name in source_map:
                    yield project.project_id, source_type, source_name

    async def _load_active_dedup_keys(self) -> set[tuple[str, str, str, str]]:
        """Load dedup keys of already-queued jobs to avoid re-enqueueing.

        For each status in ``schedule.dedup_statuses`` the queue is listed in
        pages of ``_DEDUP_PAGE_SIZE`` until an empty or short page is
        returned, so statuses with more entries than one page are still fully
        scanned. Only one page of jobs is held at a time; the returned set
        holds one entry per distinct key.

        **Offset pagination caveat:** the scan uses ``offset``/``limit``, so
        if rows are added or removed while it is in progress, entries can
        shift between pages and be skipped. The worst case is a duplicate
        enqueue for a source.
        NOTE(review): whether such a duplicate is absorbed downstream is not
        visible from this module -- confirm. Keyset pagination
        (``WHERE (enqueued_at, id) > ...``) would remove the race if it
        becomes a problem.

        Returns:
            Set of (job_type, project_id, source_type, source_name) tuples.
        """
        keys: set[tuple[str, str, str, str]] = set()
        page_size = self._DEDUP_PAGE_SIZE

        # The queue is queried one status at a time.
        for status in self._schedule.dedup_statuses:
            offset = 0
            while True:
                jobs = await self._queue.list(
                    status=status, limit=page_size, offset=offset
                )
                if not jobs:
                    break
                for job in jobs:
                    key = self._job_dedup_key(job)
                    if key is not None:
                        keys.add(key)
                # A short page means this status is exhausted.
                if len(jobs) < page_size:
                    break
                offset += page_size

        return keys

    @staticmethod
    def _normalize_identity(
        project_id, source_type, source_name
    ) -> tuple[str, str, str] | None:
        """Return the whitespace-stripped triple, or None if any part is unusable.

        A part is unusable when it is not a ``str`` or is blank after
        stripping.
        """
        parts = (project_id, source_type, source_name)
        if not all(isinstance(part, str) and part.strip() for part in parts):
            return None
        return (project_id.strip(), source_type.strip(), source_name.strip())

    @staticmethod
    def _job_dedup_key(job) -> tuple[str, str, str, str] | None:
        """Derive the dedup key of a queued job.

        Returns None when the job is not an INCREMENTAL_PULL job, when its
        ``payload_json`` is missing or not a JSON object, or when the payload
        lacks a usable ``project_id`` / ``source_type`` / ``source``.
        """
        job_type = JobType.INCREMENTAL_PULL.value
        if getattr(job, "type", None) != job_type:
            return None

        try:
            payload = json.loads(getattr(job, "payload_json", None))
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers a
            # missing / non-string payload.
            return None

        if not isinstance(payload, dict):
            return None

        identity = IncrementalPullScheduler._normalize_identity(
            payload.get("project_id"),
            payload.get("source_type"),
            payload.get("source"),
        )
        if identity is None:
            return None

        # Same job-type component as run_once() uses, so keys always match.
        return (job_type, *identity)