Coverage for src/qdrant_loader_mcp_server/search/enhanced/cdi/finders.py: 96% of 271 statements (report generated by coverage.py v7.10.6 at 2025-09-08 06:06 +0000).

1""" 

2Complementary Content Discovery for Cross-Document Intelligence. 

3 

4This module implements advanced complementary content discovery that identifies 

5documents which enhance understanding of a target document through 

6requirements-implementation chains, abstraction gaps, and cross-functional relationships. 

7""" 

8 

9from __future__ import annotations 

10 

11import time 

12from typing import TYPE_CHECKING 

13 

14if TYPE_CHECKING: 

15 pass 

16 

17from ....utils.logging import LoggingConfig 

18from ...models import SearchResult 

19from .extractors.similarity_helpers import ( 

20 get_shared_entities_count as cdi_get_shared_entities_count, 

21) 

22from .extractors.similarity_helpers import ( 

23 get_shared_technologies_count as cdi_get_shared_technologies_count, 

24) 

25from .extractors.similarity_helpers import ( 

26 get_shared_topics_count as cdi_get_shared_topics_count, 

27) 

28from .extractors.similarity_helpers import ( 

29 has_reusable_architecture_patterns as cdi_has_reusable_architecture_patterns, 

30) 

31from .extractors.similarity_helpers import ( 

32 has_shared_entities as cdi_has_shared_entities, 

33) 

34from .extractors.similarity_helpers import ( 

35 has_shared_technologies as cdi_has_shared_technologies, 

36) 

37from .extractors.similarity_helpers import has_shared_topics as cdi_has_shared_topics 

38from .extractors.similarity_helpers import ( 

39 has_transferable_domain_knowledge as cdi_has_transferable_domain_knowledge, 

40) 

41from .models import ComplementaryContent 

42 

# Module-level logger; instances also keep their own logger in ``self.logger``.
logger = LoggingConfig.get_logger(__name__)

44 

45 

class ComplementaryContentFinder:
    """Finds complementary content that would enhance understanding of a target document.

    Scoring prioritizes intra-project relationships (requirements/implementation
    chains, abstraction gaps, cross-functional perspectives) while still
    discovering inter-project relationships (shared challenges, transferable
    domain knowledge, reusable patterns, shared technologies).
    """

    def __init__(
        self,
        similarity_calculator,
        knowledge_graph=None,
    ):
        """Initialize the complementary content finder.

        Args:
            similarity_calculator: Calculator used for document similarity
                (kept for downstream use; not called directly here).
            knowledge_graph: Optional knowledge graph for relationship lookups.
        """
        self.similarity_calculator = similarity_calculator
        self.knowledge_graph = knowledge_graph
        self.logger = LoggingConfig.get_logger(__name__)

    def find_complementary_content(
        self,
        target_doc,
        candidate_docs,
        max_recommendations: int = 5,
        score_threshold: float = 0.15,
    ) -> ComplementaryContent:
        """Find complementary content for a target document.

        Args:
            target_doc: Document to find complements for (needs ``source_type``,
                ``source_title``, ``topics``, ``entities`` attributes).
            candidate_docs: Iterable of candidate documents to evaluate.
            max_recommendations: Maximum number of recommendations to return.
            score_threshold: Minimum complementary score a candidate must
                exceed to be recommended. Defaults to 0.15, which was
                previously hard-coded; kept deliberately low because
                complementary content is often only loosely similar.

        Returns:
            ComplementaryContent with up to ``max_recommendations`` entries,
            sorted by descending score.
        """
        start_time = time.time()

        recommendations = []
        target_doc_id = f"{target_doc.source_type}:{target_doc.source_title}"

        self.logger.info(f"Finding complementary content for target: {target_doc_id}")
        self.logger.info(f"Target doc topics: {target_doc.topics}")
        self.logger.info(f"Target doc entities: {target_doc.entities}")
        self.logger.info(f"Analyzing {len(candidate_docs)} candidate documents")

        for candidate in candidate_docs:
            candidate_id = f"{candidate.source_type}:{candidate.source_title}"

            # Never recommend the target against itself.
            if candidate_id == target_doc_id:
                continue

            # Consolidated candidate analysis debug (reduces verbosity)
            self.logger.debug(
                "Analyzing candidate",
                candidate_id=candidate_id,
                topics_count=len(candidate.topics),
                entities_count=len(candidate.entities),
            )

            # Calculate complementary score
            complementary_score, reason = self._calculate_complementary_score(
                target_doc, candidate
            )

            self.logger.info(
                f"Complementary score for {candidate_id}: {complementary_score:.3f} - {reason}"
            )

            if complementary_score > score_threshold:
                recommendations.append((candidate_id, complementary_score, reason))
            else:
                # Log why it didn't make the cut
                self.logger.debug(
                    f"Rejected {candidate_id}: score {complementary_score:.3f} below threshold {score_threshold}"
                )

        # Sort by complementary score
        recommendations.sort(key=lambda x: x[1], reverse=True)

        processing_time = (time.time() - start_time) * 1000
        self.logger.info(
            f"Found {len(recommendations)} complementary recommendations in {processing_time:.2f}ms"
        )

        return ComplementaryContent(
            target_doc_id=target_doc_id,
            recommendations=recommendations[:max_recommendations],
            recommendation_strategy="mixed",
        )

    def _calculate_complementary_score(
        self, target_doc, candidate_doc
    ) -> tuple[float, str]:
        """Calculate how complementary a candidate document is to the target.

        Redesigned algorithm that prioritizes intra-project relationships while
        maintaining intelligent inter-project discovery capabilities.

        Returns:
            Tuple of (score in [0, 1], human-readable reason).
        """
        self.logger.info(
            f"=== Scoring {candidate_doc.source_title} against {target_doc.source_title} ==="
        )

        same_project = target_doc.project_id == candidate_doc.project_id
        self.logger.info(
            f"Project context: target={target_doc.project_id}, candidate={candidate_doc.project_id}, same_project={same_project}"
        )

        if same_project:
            # Prioritize intra-project relationships
            score, reason = self._score_intra_project_complementary(
                target_doc, candidate_doc
            )

            # Boost for high topic relevance within project
            if score > 0 and self._has_high_topic_overlap(target_doc, candidate_doc):
                boosted_score = min(0.95, score * 1.2)
                self.logger.info(
                    f"✓ Intra-project topic boost: {score:.3f} → {boosted_score:.3f}"
                )
                score = boosted_score
                reason = f"{reason} (high topic relevance)"

        else:
            # Evaluate inter-project relationships
            score, reason = self._score_inter_project_complementary(
                target_doc, candidate_doc
            )

            # Apply cross-project penalty (inter-project content is less immediately useful)
            if score > 0:
                adjusted_score = score * 0.8
                self.logger.info(
                    f"✓ Inter-project penalty applied: {score:.3f} → {adjusted_score:.3f}"
                )
                score = adjusted_score
                reason = f"Inter-project: {reason}"

        self.logger.info(
            f"Final complementary score: {score:.3f} for {candidate_doc.source_title} - {reason}"
        )
        return score, reason

    def _score_intra_project_complementary(
        self, target_doc, candidate_doc
    ) -> tuple[float, str]:
        """Score complementary relationships within the same project.

        Collects weighted factors (A-D below) and combines them via
        :meth:`_calculate_weighted_score`.
        """
        factors = []

        # A. Requirements ↔ Implementation Chain
        if self._is_requirements_implementation_pair(target_doc, candidate_doc):
            factors.append((0.85, "requirements-implementation"))
            self.logger.info("✓ Found requirements-implementation pair")

        # B. Abstraction Level Differences
        abstraction_gap = self._calculate_abstraction_gap(target_doc, candidate_doc)
        if abstraction_gap > 0:
            score = 0.7 + (abstraction_gap * 0.1)
            factors.append(
                (score, f"Different abstraction levels (gap: {abstraction_gap})")
            )
            self.logger.info(
                f"✓ Abstraction gap: {abstraction_gap} → score: {score:.3f}"
            )

        # C. Cross-Functional Perspectives
        if self._has_cross_functional_relationship(target_doc, candidate_doc):
            factors.append((0.75, "Cross-functional perspectives"))
            self.logger.info("✓ Cross-functional relationship detected")

        # D. Topic Overlap with Different Document Types
        if self._has_shared_topics(
            target_doc, candidate_doc
        ) and self._has_different_document_types(target_doc, candidate_doc):
            shared_topics = self._get_shared_topics_count(target_doc, candidate_doc)
            score = min(0.65, 0.35 + (shared_topics * 0.1))
            factors.append(
                (
                    score,
                    f"Same topics, different document types ({shared_topics} topics)",
                )
            )
            self.logger.info(f"✓ Topic overlap with different doc types: {score:.3f}")

        return self._calculate_weighted_score(factors, target_doc, candidate_doc)

    def _score_inter_project_complementary(
        self, target_doc, candidate_doc
    ) -> tuple[float, str]:
        """Score complementary relationships between different projects.

        Collects weighted factors (A-D below) and combines them via
        :meth:`_calculate_weighted_score`.
        """
        factors = []

        # A. Similar Challenges/Solutions
        if self._has_similar_challenges(target_doc, candidate_doc):
            factors.append((0.8, "Similar challenges/solutions"))
            self.logger.info("✓ Similar challenges detected")

        # B. Domain Expertise Transfer
        if self._has_transferable_domain_knowledge(target_doc, candidate_doc):
            factors.append((0.75, "Transferable domain knowledge"))
            self.logger.info("✓ Transferable domain knowledge")

        # C. Architectural Patterns
        if self._has_reusable_architecture_patterns(target_doc, candidate_doc):
            factors.append((0.7, "Reusable architecture patterns"))
            self.logger.info("✓ Architecture patterns detected")

        # D. Shared Technologies/Standards
        if self._has_shared_technologies(target_doc, candidate_doc):
            shared_count = self._get_shared_technologies_count(
                target_doc, candidate_doc
            )
            score = min(0.6, 0.3 + (shared_count * 0.1))
            factors.append((score, f"Shared technologies ({shared_count} common)"))
            self.logger.info(f"✓ Shared technologies: {score:.3f}")

        return self._calculate_weighted_score(factors, target_doc, candidate_doc)

    def _calculate_weighted_score(
        self,
        factors: list[tuple[float, str]],
        target_doc=None,
        candidate_doc=None,
    ) -> tuple[float, str]:
        """Calculate weighted score from multiple factors.

        The highest factor is primary; additional factors add 10% of their sum
        as a boost, capped at 0.95. Requirements-implementation factors take
        priority over higher-scoring factors. Falls back to
        :meth:`_enhanced_fallback_scoring` when no factors are present and both
        documents are provided.
        """
        if not factors:
            if target_doc and candidate_doc:
                return self._enhanced_fallback_scoring(target_doc, candidate_doc)
            else:
                return 0.0, "No complementary relationship found"

        # Sort factors by score but give priority to requirements-implementation relationships
        factors.sort(key=lambda x: x[0], reverse=True)

        # Check for high-priority relationships first
        for score, reason in factors:
            if "requirements-implementation" in reason.lower():
                # Requirements-implementation pairs get priority
                if len(factors) > 1:
                    secondary_boost = sum(s for s, r in factors if r != reason) * 0.1
                    final_score = min(0.95, score + secondary_boost)
                    primary_reason = f"{reason} (+{len(factors)-1} other factors)"
                else:
                    final_score = score
                    primary_reason = reason
                return final_score, primary_reason

        # Use the highest scoring factor as primary
        primary_score, primary_reason = factors[0]

        # Boost if multiple factors contribute
        if len(factors) > 1:
            secondary_boost = sum(score for score, _ in factors[1:]) * 0.1
            final_score = min(0.95, primary_score + secondary_boost)
            primary_reason = f"{primary_reason} (+{len(factors)-1} other factors)"
        else:
            final_score = primary_score

        return final_score, primary_reason

    def _is_requirements_implementation_pair(self, doc1, doc2) -> bool:
        """Detect if documents form a requirements -> implementation chain.

        Classification is keyword-based on titles. Same-project pairs qualify
        on keywords alone; cross-project pairs additionally require shared
        topics or entities.
        """
        req_keywords = [
            "requirements",
            "specification",
            "user story",
            "feature",
            "functional",
        ]
        impl_keywords = [
            "implementation",
            "technical",
            "architecture",
            "api",
            "code",
            "development",
        ]

        title1 = doc1.source_title.lower()
        title2 = doc2.source_title.lower()

        doc1_is_req = any(keyword in title1 for keyword in req_keywords)
        doc1_is_impl = any(keyword in title1 for keyword in impl_keywords)
        doc2_is_req = any(keyword in title2 for keyword in req_keywords)
        doc2_is_impl = any(keyword in title2 for keyword in impl_keywords)

        # One is requirements, other is implementation
        is_req_impl_pair = (doc1_is_req and doc2_is_impl) or (
            doc1_is_impl and doc2_is_req
        )

        if not is_req_impl_pair:
            return False

        # For same-project documents, we don't require shared topics/entities
        # as the project context already provides relationship
        same_project = (
            getattr(doc1, "project_id", None) == getattr(doc2, "project_id", None)
            and getattr(doc1, "project_id", None) is not None
        )

        if same_project:
            return True

        # For different projects, require some shared context
        return self._has_shared_topics(doc1, doc2) or self._has_shared_entities(
            doc1, doc2
        )

    def _calculate_abstraction_gap(self, doc1: SearchResult, doc2: SearchResult) -> int:
        """Calculate difference in abstraction levels (0-3).
        0: Same level, 3: Maximum gap (e.g., epic vs implementation detail)
        """
        level1 = self._get_abstraction_level(doc1)
        level2 = self._get_abstraction_level(doc2)
        return abs(level1 - level2)

    def _get_abstraction_level(self, doc: SearchResult) -> int:
        """Determine abstraction level of document (0=highest, 3=lowest).

        Keyword-based on the title; unmatched titles default to level 2.
        """
        title = doc.source_title.lower()

        # Level 0: High-level business/strategy
        if any(
            keyword in title
            for keyword in [
                "strategy",
                "vision",
                "overview",
                "executive",
                "business case",
            ]
        ):
            return 0

        # Level 1: Requirements/features
        if any(
            keyword in title
            for keyword in [
                "requirements",
                "features",
                "user story",
                "epic",
                "specification",
            ]
        ):
            return 1

        # Level 2: Design/architecture
        if any(
            keyword in title
            for keyword in [
                "design",
                "architecture",
                "workflow",
                "process",
                "wireframe",
            ]
        ):
            return 2

        # Level 3: Implementation details
        if any(
            keyword in title
            for keyword in [
                "implementation",
                "code",
                "api",
                "technical",
                "development",
                "configuration",
            ]
        ):
            return 3

        # Default to middle level
        return 2

    def _has_cross_functional_relationship(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> bool:
        """Detect business + technical, feature + security, etc."""
        business_keywords = [
            "business",
            "user",
            "requirements",
            "workflow",
            "process",
            "feature",
        ]
        technical_keywords = [
            "technical",
            "architecture",
            "api",
            "implementation",
            "code",
            "development",
        ]
        security_keywords = [
            "security",
            "authentication",
            "authorization",
            "compliance",
            "audit",
        ]

        title1 = doc1.source_title.lower()
        title2 = doc2.source_title.lower()

        # Business + Technical
        if (
            any(k in title1 for k in business_keywords)
            and any(k in title2 for k in technical_keywords)
        ) or (
            any(k in title2 for k in business_keywords)
            and any(k in title1 for k in technical_keywords)
        ):
            return True

        # Feature + Security
        if (
            any(k in title1 for k in ["feature", "functionality"])
            and any(k in title2 for k in security_keywords)
        ) or (
            any(k in title2 for k in ["feature", "functionality"])
            and any(k in title1 for k in security_keywords)
        ):
            return True

        return False

    def _has_different_document_types(self, doc1, doc2) -> bool:
        """Check if documents are of different types based on content and title."""
        type1 = self._classify_document_type(doc1)
        type2 = self._classify_document_type(doc2)
        return type1 != type2

    def _classify_document_type(self, doc) -> str:
        """Classify document as: user_story, technical_spec, architecture, compliance, testing, etc.

        Keyword-based on the title; more specific categories are checked first
        to avoid conflicts, and unmatched titles fall through to "general".
        """
        title = doc.source_title.lower()

        # Check more specific categories first to avoid conflicts
        if any(
            keyword in title
            for keyword in ["security", "compliance", "audit", "policy"]
        ):
            return "compliance"
        elif any(keyword in title for keyword in ["test", "testing", "qa", "quality"]):
            return "testing"
        elif any(keyword in title for keyword in ["tutorial", "how-to", "walkthrough"]):
            return "tutorial"
        elif any(keyword in title for keyword in ["reference", "manual"]):
            return "reference"
        elif any(keyword in title for keyword in ["example", "sample", "demo"]):
            return "example"
        elif any(keyword in title for keyword in ["user story", "epic", "feature"]):
            return "user_story"
        elif any(
            keyword in title
            for keyword in ["technical", "specification", "api", "implementation"]
        ):
            return "technical_spec"
        elif any(keyword in title for keyword in ["architecture", "design", "system"]):
            return "architecture"
        elif any(
            keyword in title
            for keyword in ["workflow", "process", "procedure", "guide"]
        ):
            return "process"
        elif any(
            keyword in title for keyword in ["requirement"]
        ):  # More general, check last
            return "user_story"
        else:
            return "general"

    def _has_high_topic_overlap(self, doc1: SearchResult, doc2: SearchResult) -> bool:
        """Check if documents have high topic overlap (>= 3 shared topics)."""
        return self._get_shared_topics_count(doc1, doc2) >= 3

    def _has_similar_challenges(self, doc1: SearchResult, doc2: SearchResult) -> bool:
        """Identify common challenge patterns (auth, scalability, compliance)."""
        challenge_patterns = [
            ["authentication", "login", "auth", "signin"],
            ["scalability", "performance", "optimization", "scale"],
            ["compliance", "regulation", "audit", "governance"],
            ["integration", "api", "interface", "connection"],
            ["security", "privacy", "protection", "safety"],
            ["migration", "upgrade", "transition", "conversion"],
        ]

        title1 = doc1.source_title.lower()
        title2 = doc2.source_title.lower()

        # Both titles must match keywords from the same pattern group.
        for pattern in challenge_patterns:
            if any(keyword in title1 for keyword in pattern) and any(
                keyword in title2 for keyword in pattern
            ):
                return True

        return False

    def _has_transferable_domain_knowledge(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> bool:
        """Check for transferable domain expertise between projects (delegates to CDI helper)."""
        return cdi_has_transferable_domain_knowledge(doc1, doc2)

    def _has_reusable_architecture_patterns(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> bool:
        """Identify reusable architecture patterns (delegates to CDI helper)."""
        return cdi_has_reusable_architecture_patterns(doc1, doc2)

    def _has_shared_technologies(self, doc1: SearchResult, doc2: SearchResult) -> bool:
        """Identify shared technologies, frameworks, standards (delegates to CDI helper)."""
        return cdi_has_shared_technologies(doc1, doc2)

    def _get_shared_technologies_count(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> int:
        """Count shared technologies between documents (delegates to CDI helper)."""
        return cdi_get_shared_technologies_count(doc1, doc2)

    def _enhanced_fallback_scoring(
        self, target_doc, candidate_doc
    ) -> tuple[float, str]:
        """Enhanced fallback when advanced algorithms don't apply."""
        fallback_score = self._calculate_fallback_score(target_doc, candidate_doc)
        if fallback_score > 0:
            return fallback_score, "Basic content similarity"
        else:
            return 0.0, "No complementary relationship found"

    def _calculate_fallback_score(
        self, target_doc: SearchResult, candidate_doc: SearchResult
    ) -> float:
        """Fallback scoring for when advanced methods don't find relationships.

        Uses shared topics, shared entities, and title word overlap; the
        result is capped at 0.5 so fallback never outranks the main factors.
        """
        score = 0.0

        # Just having any shared topics at all
        if self._has_shared_topics(target_doc, candidate_doc):
            shared_count = self._get_shared_topics_count(target_doc, candidate_doc)
            score = max(score, 0.2 + (shared_count * 0.05))
            self.logger.debug(
                f"Fallback: {shared_count} shared topics → score: {score:.3f}"
            )

        # Just having any shared entities at all
        if self._has_shared_entities(target_doc, candidate_doc):
            shared_count = self._get_shared_entities_count(target_doc, candidate_doc)
            score = max(score, 0.15 + (shared_count * 0.05))
            self.logger.debug(
                f"Fallback: {shared_count} shared entities → score: {score:.3f}"
            )

        # Simple keyword overlap in titles
        target_words = set(target_doc.source_title.lower().split())
        candidate_words = set(candidate_doc.source_title.lower().split())
        common_words = target_words & candidate_words
        if len(common_words) > 1:  # More than just common words like "the", "and"
            score = max(score, 0.1 + (len(common_words) * 0.02))
            self.logger.debug(
                f"Fallback: {len(common_words)} common words in titles → score: {score:.3f}"
            )

        return min(score, 0.5)  # Cap fallback scores

    def _has_shared_entities(self, doc1: SearchResult, doc2: SearchResult) -> bool:
        """Check if documents have shared entities (delegates to CDI helper)."""
        return cdi_has_shared_entities(doc1, doc2)

    def _has_shared_topics(self, doc1: SearchResult, doc2: SearchResult) -> bool:
        """Check if documents have shared topics (delegates to CDI helper)."""
        return cdi_has_shared_topics(doc1, doc2)

    def _get_shared_topics_count(self, doc1: SearchResult, doc2: SearchResult) -> int:
        """Get the count of shared topics (delegates to CDI helper)."""
        return cdi_get_shared_topics_count(doc1, doc2)

    def _get_shared_entities_count(self, doc1: SearchResult, doc2: SearchResult) -> int:
        """Get the count of shared entities (delegates to CDI helper)."""
        return cdi_get_shared_entities_count(doc1, doc2)

    def _has_different_content_complexity(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> bool:
        """Check if documents have different levels of content complexity."""
        # Compare word counts if available
        wc1 = int(getattr(doc1, "word_count", 0) or 0)
        wc2 = int(getattr(doc2, "word_count", 0) or 0)

        # Guard against None or non-positive counts to avoid ZeroDivisionError
        if wc1 > 0 and wc2 > 0:
            ratio = max(wc1, wc2) / min(wc1, wc2)
            if ratio > 2.0:  # One document is significantly longer
                return True

        # Compare content features
        features1 = (doc1.has_code_blocks, doc1.has_tables, doc1.has_images)
        features2 = (doc2.has_code_blocks, doc2.has_tables, doc2.has_images)

        # Different if one has technical content and the other doesn't
        return features1 != features2

    def _get_complementary_content_type_score(
        self, target_doc: SearchResult, candidate_doc: SearchResult
    ) -> float:
        """Calculate score based on complementary content types.

        Checks several title-keyword pairings (technical/business,
        documentation/implementation, tutorial/reference, requirements/design)
        and returns the highest matching score, or 0.0 if none match.
        """
        score = 0.0

        # Technical + Business complement
        technical_keywords = [
            "api",
            "code",
            "implementation",
            "technical",
            "development",
            "architecture",
        ]
        business_keywords = [
            "requirements",
            "business",
            "specification",
            "user",
            "workflow",
            "process",
        ]

        target_title = target_doc.source_title.lower()
        candidate_title = candidate_doc.source_title.lower()

        target_is_technical = any(
            keyword in target_title for keyword in technical_keywords
        )
        target_is_business = any(
            keyword in target_title for keyword in business_keywords
        )
        candidate_is_technical = any(
            keyword in candidate_title for keyword in technical_keywords
        )
        candidate_is_business = any(
            keyword in candidate_title for keyword in business_keywords
        )

        # Technical document + Business document = complementary
        if (target_is_technical and candidate_is_business) or (
            target_is_business and candidate_is_technical
        ):
            score = max(score, 0.7)

        # Documentation + Implementation complement
        if (
            "documentation" in target_title and "implementation" in candidate_title
        ) or ("implementation" in target_title and "documentation" in candidate_title):
            score = max(score, 0.6)

        # Tutorial + Reference complement
        tutorial_keywords = [
            "tutorial",
            "guide",
            "how-to",
            "walkthrough",
            "quick start",
        ]
        reference_keywords = ["reference", "api", "specification", "manual", "docs"]
        target_is_tutorial = any(k in target_title for k in tutorial_keywords)
        target_is_reference = any(k in target_title for k in reference_keywords)
        candidate_is_tutorial = any(k in candidate_title for k in tutorial_keywords)
        candidate_is_reference = any(k in candidate_title for k in reference_keywords)
        if (target_is_tutorial and candidate_is_reference) or (
            target_is_reference and candidate_is_tutorial
        ):
            score = max(score, 0.6)

        # Requirements + Design complement
        if (
            "requirements" in target_title
            and ("design" in candidate_title or "architecture" in candidate_title)
        ) or (
            ("design" in target_title or "architecture" in target_title)
            and "requirements" in candidate_title
        ):
            score = max(score, 0.6)

        return score