Coverage for src/qdrant_loader_mcp_server/search/enhanced/cdi/finders.py: 96%
271 statements
« prev ^ index » next — coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
1"""
2Complementary Content Discovery for Cross-Document Intelligence.
4This module implements advanced complementary content discovery that identifies
5documents which enhance understanding of a target document through
6requirements-implementation chains, abstraction gaps, and cross-functional relationships.
7"""
9from __future__ import annotations
11import time
12from typing import TYPE_CHECKING
14if TYPE_CHECKING:
15 pass
17from ....utils.logging import LoggingConfig
18from ...models import SearchResult
19from .extractors.similarity_helpers import (
20 get_shared_entities_count as cdi_get_shared_entities_count,
21)
22from .extractors.similarity_helpers import (
23 get_shared_technologies_count as cdi_get_shared_technologies_count,
24)
25from .extractors.similarity_helpers import (
26 get_shared_topics_count as cdi_get_shared_topics_count,
27)
28from .extractors.similarity_helpers import (
29 has_reusable_architecture_patterns as cdi_has_reusable_architecture_patterns,
30)
31from .extractors.similarity_helpers import (
32 has_shared_entities as cdi_has_shared_entities,
33)
34from .extractors.similarity_helpers import (
35 has_shared_technologies as cdi_has_shared_technologies,
36)
37from .extractors.similarity_helpers import has_shared_topics as cdi_has_shared_topics
38from .extractors.similarity_helpers import (
39 has_transferable_domain_knowledge as cdi_has_transferable_domain_knowledge,
40)
41from .models import ComplementaryContent
# Module-level logger; instances also keep their own logger (see __init__).
logger = LoggingConfig.get_logger(__name__)
class ComplementaryContentFinder:
    """Finds complementary content that would enhance understanding of a target document."""

    def __init__(
        self,
        similarity_calculator,
        knowledge_graph=None,
    ):
        """Initialize the complementary content finder.

        Args:
            similarity_calculator: Calculator used for document similarity scoring.
            knowledge_graph: Optional knowledge graph used for relationship lookups.
        """
        self.logger = LoggingConfig.get_logger(__name__)
        self.similarity_calculator = similarity_calculator
        self.knowledge_graph = knowledge_graph
59 def find_complementary_content(
60 self,
61 target_doc,
62 candidate_docs,
63 max_recommendations: int = 5,
64 ) -> ComplementaryContent:
65 """Find complementary content for a target document."""
66 start_time = time.time()
68 recommendations = []
69 target_doc_id = f"{target_doc.source_type}:{target_doc.source_title}"
71 self.logger.info(f"Finding complementary content for target: {target_doc_id}")
72 self.logger.info(f"Target doc topics: {target_doc.topics}")
73 self.logger.info(f"Target doc entities: {target_doc.entities}")
74 self.logger.info(f"Analyzing {len(candidate_docs)} candidate documents")
76 for candidate in candidate_docs:
77 candidate_id = f"{candidate.source_type}:{candidate.source_title}"
79 if candidate_id == target_doc_id:
80 continue
82 # Consolidated candidate analysis debug (reduces verbosity)
83 self.logger.debug(
84 "Analyzing candidate",
85 candidate_id=candidate_id,
86 topics_count=len(candidate.topics),
87 entities_count=len(candidate.entities),
88 )
90 # Calculate complementary score
91 complementary_score, reason = self._calculate_complementary_score(
92 target_doc, candidate
93 )
95 self.logger.info(
96 f"Complementary score for {candidate_id}: {complementary_score:.3f} - {reason}"
97 )
99 if (
100 complementary_score > 0.15
101 ): # Lowered threshold for complementary content
102 recommendations.append((candidate_id, complementary_score, reason))
103 else:
104 # Log why it didn't make the cut
105 self.logger.debug(
106 f"Rejected {candidate_id}: score {complementary_score:.3f} below threshold 0.15"
107 )
109 # Sort by complementary score
110 recommendations.sort(key=lambda x: x[1], reverse=True)
112 processing_time = (time.time() - start_time) * 1000
113 self.logger.info(
114 f"Found {len(recommendations)} complementary recommendations in {processing_time:.2f}ms"
115 )
117 return ComplementaryContent(
118 target_doc_id=target_doc_id,
119 recommendations=recommendations[:max_recommendations],
120 recommendation_strategy="mixed",
121 )
123 def _calculate_complementary_score(
124 self, target_doc, candidate_doc
125 ) -> tuple[float, str]:
126 """Calculate how complementary a candidate document is to the target.
128 Redesigned algorithm that prioritizes intra-project relationships while
129 maintaining intelligent inter-project discovery capabilities.
130 """
131 self.logger.info(
132 f"=== Scoring {candidate_doc.source_title} against {target_doc.source_title} ==="
133 )
135 same_project = target_doc.project_id == candidate_doc.project_id
136 self.logger.info(
137 f"Project context: target={target_doc.project_id}, candidate={candidate_doc.project_id}, same_project={same_project}"
138 )
140 if same_project:
141 # Prioritize intra-project relationships
142 score, reason = self._score_intra_project_complementary(
143 target_doc, candidate_doc
144 )
146 # Boost for high topic relevance within project
147 if score > 0 and self._has_high_topic_overlap(target_doc, candidate_doc):
148 boosted_score = min(0.95, score * 1.2)
149 self.logger.info(
150 f"✓ Intra-project topic boost: {score:.3f} → {boosted_score:.3f}"
151 )
152 score = boosted_score
153 reason = f"{reason} (high topic relevance)"
155 else:
156 # Evaluate inter-project relationships
157 score, reason = self._score_inter_project_complementary(
158 target_doc, candidate_doc
159 )
161 # Apply cross-project penalty (inter-project content is less immediately useful)
162 if score > 0:
163 adjusted_score = score * 0.8
164 self.logger.info(
165 f"✓ Inter-project penalty applied: {score:.3f} → {adjusted_score:.3f}"
166 )
167 score = adjusted_score
168 reason = f"Inter-project: {reason}"
170 self.logger.info(
171 f"Final complementary score: {score:.3f} for {candidate_doc.source_title} - {reason}"
172 )
173 return score, reason
175 def _score_intra_project_complementary(
176 self, target_doc, candidate_doc
177 ) -> tuple[float, str]:
178 """Score complementary relationships within the same project."""
179 factors = []
181 # A. Requirements ↔ Implementation Chain
182 if self._is_requirements_implementation_pair(target_doc, candidate_doc):
183 factors.append((0.85, "requirements-implementation"))
184 self.logger.info("✓ Found requirements-implementation pair")
186 # B. Abstraction Level Differences
187 abstraction_gap = self._calculate_abstraction_gap(target_doc, candidate_doc)
188 if abstraction_gap > 0:
189 score = 0.7 + (abstraction_gap * 0.1)
190 factors.append(
191 (score, f"Different abstraction levels (gap: {abstraction_gap})")
192 )
193 self.logger.info(
194 f"✓ Abstraction gap: {abstraction_gap} → score: {score:.3f}"
195 )
197 # C. Cross-Functional Perspectives
198 if self._has_cross_functional_relationship(target_doc, candidate_doc):
199 factors.append((0.75, "Cross-functional perspectives"))
200 self.logger.info("✓ Cross-functional relationship detected")
202 # D. Topic Overlap with Different Document Types
203 if self._has_shared_topics(
204 target_doc, candidate_doc
205 ) and self._has_different_document_types(target_doc, candidate_doc):
206 shared_topics = self._get_shared_topics_count(target_doc, candidate_doc)
207 score = min(0.65, 0.35 + (shared_topics * 0.1))
208 factors.append(
209 (
210 score,
211 f"Same topics, different document types ({shared_topics} topics)",
212 )
213 )
214 self.logger.info(f"✓ Topic overlap with different doc types: {score:.3f}")
216 return self._calculate_weighted_score(factors, target_doc, candidate_doc)
218 def _score_inter_project_complementary(
219 self, target_doc, candidate_doc
220 ) -> tuple[float, str]:
221 """Score complementary relationships between different projects."""
222 factors = []
224 # A. Similar Challenges/Solutions
225 if self._has_similar_challenges(target_doc, candidate_doc):
226 factors.append((0.8, "Similar challenges/solutions"))
227 self.logger.info("✓ Similar challenges detected")
229 # B. Domain Expertise Transfer
230 if self._has_transferable_domain_knowledge(target_doc, candidate_doc):
231 factors.append((0.75, "Transferable domain knowledge"))
232 self.logger.info("✓ Transferable domain knowledge")
234 # C. Architectural Patterns
235 if self._has_reusable_architecture_patterns(target_doc, candidate_doc):
236 factors.append((0.7, "Reusable architecture patterns"))
237 self.logger.info("✓ Architecture patterns detected")
239 # D. Shared Technologies/Standards
240 if self._has_shared_technologies(target_doc, candidate_doc):
241 shared_count = self._get_shared_technologies_count(
242 target_doc, candidate_doc
243 )
244 score = min(0.6, 0.3 + (shared_count * 0.1))
245 factors.append((score, f"Shared technologies ({shared_count} common)"))
246 self.logger.info(f"✓ Shared technologies: {score:.3f}")
248 return self._calculate_weighted_score(factors, target_doc, candidate_doc)
250 def _calculate_weighted_score(
251 self,
252 factors: list[tuple[float, str]],
253 target_doc=None,
254 candidate_doc=None,
255 ) -> tuple[float, str]:
256 """Calculate weighted score from multiple factors."""
257 if not factors:
258 if target_doc and candidate_doc:
259 return self._enhanced_fallback_scoring(target_doc, candidate_doc)
260 else:
261 return 0.0, "No complementary relationship found"
263 # Sort factors by score but give priority to requirements-implementation relationships
264 factors.sort(key=lambda x: x[0], reverse=True)
266 # Check for high-priority relationships first
267 for score, reason in factors:
268 if "requirements-implementation" in reason.lower():
269 # Requirements-implementation pairs get priority
270 if len(factors) > 1:
271 secondary_boost = sum(s for s, r in factors if r != reason) * 0.1
272 final_score = min(0.95, score + secondary_boost)
273 primary_reason = f"{reason} (+{len(factors)-1} other factors)"
274 else:
275 final_score = score
276 primary_reason = reason
277 return final_score, primary_reason
279 # Use the highest scoring factor as primary
280 primary_score, primary_reason = factors[0]
282 # Boost if multiple factors contribute
283 if len(factors) > 1:
284 secondary_boost = sum(score for score, _ in factors[1:]) * 0.1
285 final_score = min(0.95, primary_score + secondary_boost)
286 primary_reason = f"{primary_reason} (+{len(factors)-1} other factors)"
287 else:
288 final_score = primary_score
290 return final_score, primary_reason
292 def _is_requirements_implementation_pair(self, doc1, doc2) -> bool:
293 """Detect if documents form a requirements -> implementation chain."""
294 req_keywords = [
295 "requirements",
296 "specification",
297 "user story",
298 "feature",
299 "functional",
300 ]
301 impl_keywords = [
302 "implementation",
303 "technical",
304 "architecture",
305 "api",
306 "code",
307 "development",
308 ]
310 title1 = doc1.source_title.lower()
311 title2 = doc2.source_title.lower()
313 doc1_is_req = any(keyword in title1 for keyword in req_keywords)
314 doc1_is_impl = any(keyword in title1 for keyword in impl_keywords)
315 doc2_is_req = any(keyword in title2 for keyword in req_keywords)
316 doc2_is_impl = any(keyword in title2 for keyword in impl_keywords)
318 # One is requirements, other is implementation
319 is_req_impl_pair = (doc1_is_req and doc2_is_impl) or (
320 doc1_is_impl and doc2_is_req
321 )
323 if not is_req_impl_pair:
324 return False
326 # For same-project documents, we don't require shared topics/entities
327 # as the project context already provides relationship
328 same_project = (
329 getattr(doc1, "project_id", None) == getattr(doc2, "project_id", None)
330 and getattr(doc1, "project_id", None) is not None
331 )
333 if same_project:
334 return True
336 # For different projects, require some shared context
337 return self._has_shared_topics(doc1, doc2) or self._has_shared_entities(
338 doc1, doc2
339 )
341 def _calculate_abstraction_gap(self, doc1: SearchResult, doc2: SearchResult) -> int:
342 """Calculate difference in abstraction levels (0-3).
343 0: Same level, 3: Maximum gap (e.g., epic vs implementation detail)
344 """
345 level1 = self._get_abstraction_level(doc1)
346 level2 = self._get_abstraction_level(doc2)
347 return abs(level1 - level2)
349 def _get_abstraction_level(self, doc: SearchResult) -> int:
350 """Determine abstraction level of document (0=highest, 3=lowest)."""
351 title = doc.source_title.lower()
353 # Level 0: High-level business/strategy
354 if any(
355 keyword in title
356 for keyword in [
357 "strategy",
358 "vision",
359 "overview",
360 "executive",
361 "business case",
362 ]
363 ):
364 return 0
366 # Level 1: Requirements/features
367 if any(
368 keyword in title
369 for keyword in [
370 "requirements",
371 "features",
372 "user story",
373 "epic",
374 "specification",
375 ]
376 ):
377 return 1
379 # Level 2: Design/architecture
380 if any(
381 keyword in title
382 for keyword in [
383 "design",
384 "architecture",
385 "workflow",
386 "process",
387 "wireframe",
388 ]
389 ):
390 return 2
392 # Level 3: Implementation details
393 if any(
394 keyword in title
395 for keyword in [
396 "implementation",
397 "code",
398 "api",
399 "technical",
400 "development",
401 "configuration",
402 ]
403 ):
404 return 3
406 # Default to middle level
407 return 2
409 def _has_cross_functional_relationship(
410 self, doc1: SearchResult, doc2: SearchResult
411 ) -> bool:
412 """Detect business + technical, feature + security, etc."""
413 business_keywords = [
414 "business",
415 "user",
416 "requirements",
417 "workflow",
418 "process",
419 "feature",
420 ]
421 technical_keywords = [
422 "technical",
423 "architecture",
424 "api",
425 "implementation",
426 "code",
427 "development",
428 ]
429 security_keywords = [
430 "security",
431 "authentication",
432 "authorization",
433 "compliance",
434 "audit",
435 ]
437 title1 = doc1.source_title.lower()
438 title2 = doc2.source_title.lower()
440 # Business + Technical
441 if (
442 any(k in title1 for k in business_keywords)
443 and any(k in title2 for k in technical_keywords)
444 ) or (
445 any(k in title2 for k in business_keywords)
446 and any(k in title1 for k in technical_keywords)
447 ):
448 return True
450 # Feature + Security
451 if (
452 any(k in title1 for k in ["feature", "functionality"])
453 and any(k in title2 for k in security_keywords)
454 ) or (
455 any(k in title2 for k in ["feature", "functionality"])
456 and any(k in title1 for k in security_keywords)
457 ):
458 return True
460 return False
462 def _has_different_document_types(self, doc1, doc2) -> bool:
463 """Check if documents are of different types based on content and title."""
464 type1 = self._classify_document_type(doc1)
465 type2 = self._classify_document_type(doc2)
466 return type1 != type2
468 def _classify_document_type(self, doc) -> str:
469 """Classify document as: user_story, technical_spec, architecture, compliance, testing, etc."""
470 title = doc.source_title.lower()
472 # Check more specific categories first to avoid conflicts
473 if any(
474 keyword in title
475 for keyword in ["security", "compliance", "audit", "policy"]
476 ):
477 return "compliance"
478 elif any(keyword in title for keyword in ["test", "testing", "qa", "quality"]):
479 return "testing"
480 elif any(keyword in title for keyword in ["tutorial", "how-to", "walkthrough"]):
481 return "tutorial"
482 elif any(keyword in title for keyword in ["reference", "manual"]):
483 return "reference"
484 elif any(keyword in title for keyword in ["example", "sample", "demo"]):
485 return "example"
486 elif any(keyword in title for keyword in ["user story", "epic", "feature"]):
487 return "user_story"
488 elif any(
489 keyword in title
490 for keyword in ["technical", "specification", "api", "implementation"]
491 ):
492 return "technical_spec"
493 elif any(keyword in title for keyword in ["architecture", "design", "system"]):
494 return "architecture"
495 elif any(
496 keyword in title
497 for keyword in ["workflow", "process", "procedure", "guide"]
498 ):
499 return "process"
500 elif any(
501 keyword in title for keyword in ["requirement"]
502 ): # More general, check last
503 return "user_story"
504 else:
505 return "general"
507 def _has_high_topic_overlap(self, doc1: SearchResult, doc2: SearchResult) -> bool:
508 """Check if documents have high topic overlap (>= 3 shared topics)."""
509 return self._get_shared_topics_count(doc1, doc2) >= 3
511 def _has_similar_challenges(self, doc1: SearchResult, doc2: SearchResult) -> bool:
512 """Identify common challenge patterns (auth, scalability, compliance)."""
513 challenge_patterns = [
514 ["authentication", "login", "auth", "signin"],
515 ["scalability", "performance", "optimization", "scale"],
516 ["compliance", "regulation", "audit", "governance"],
517 ["integration", "api", "interface", "connection"],
518 ["security", "privacy", "protection", "safety"],
519 ["migration", "upgrade", "transition", "conversion"],
520 ]
522 title1 = doc1.source_title.lower()
523 title2 = doc2.source_title.lower()
525 for pattern in challenge_patterns:
526 if any(keyword in title1 for keyword in pattern) and any(
527 keyword in title2 for keyword in pattern
528 ):
529 return True
531 return False
533 def _has_transferable_domain_knowledge(
534 self, doc1: SearchResult, doc2: SearchResult
535 ) -> bool:
536 """Check for transferable domain expertise between projects (delegates to CDI helper)."""
537 return cdi_has_transferable_domain_knowledge(doc1, doc2)
539 def _has_reusable_architecture_patterns(
540 self, doc1: SearchResult, doc2: SearchResult
541 ) -> bool:
542 """Identify reusable architecture patterns (delegates to CDI helper)."""
543 return cdi_has_reusable_architecture_patterns(doc1, doc2)
545 def _has_shared_technologies(self, doc1: SearchResult, doc2: SearchResult) -> bool:
546 """Identify shared technologies, frameworks, standards (delegates to CDI helper)."""
547 return cdi_has_shared_technologies(doc1, doc2)
549 def _get_shared_technologies_count(
550 self, doc1: SearchResult, doc2: SearchResult
551 ) -> int:
552 """Count shared technologies between documents (delegates to CDI helper)."""
553 return cdi_get_shared_technologies_count(doc1, doc2)
555 def _enhanced_fallback_scoring(
556 self, target_doc, candidate_doc
557 ) -> tuple[float, str]:
558 """Enhanced fallback when advanced algorithms don't apply."""
559 fallback_score = self._calculate_fallback_score(target_doc, candidate_doc)
560 if fallback_score > 0:
561 return fallback_score, "Basic content similarity"
562 else:
563 return 0.0, "No complementary relationship found"
565 def _calculate_fallback_score(
566 self, target_doc: SearchResult, candidate_doc: SearchResult
567 ) -> float:
568 """Fallback scoring for when advanced methods don't find relationships."""
569 score = 0.0
571 # Just having any shared topics at all
572 if self._has_shared_topics(target_doc, candidate_doc):
573 shared_count = self._get_shared_topics_count(target_doc, candidate_doc)
574 score = max(score, 0.2 + (shared_count * 0.05))
575 self.logger.debug(
576 f"Fallback: {shared_count} shared topics → score: {score:.3f}"
577 )
579 # Just having any shared entities at all
580 if self._has_shared_entities(target_doc, candidate_doc):
581 shared_count = self._get_shared_entities_count(target_doc, candidate_doc)
582 score = max(score, 0.15 + (shared_count * 0.05))
583 self.logger.debug(
584 f"Fallback: {shared_count} shared entities → score: {score:.3f}"
585 )
587 # Simple keyword overlap in titles
588 target_words = set(target_doc.source_title.lower().split())
589 candidate_words = set(candidate_doc.source_title.lower().split())
590 common_words = target_words & candidate_words
591 if len(common_words) > 1: # More than just common words like "the", "and"
592 score = max(score, 0.1 + (len(common_words) * 0.02))
593 self.logger.debug(
594 f"Fallback: {len(common_words)} common words in titles → score: {score:.3f}"
595 )
597 return min(score, 0.5) # Cap fallback scores
599 def _has_shared_entities(self, doc1: SearchResult, doc2: SearchResult) -> bool:
600 """Check if documents have shared entities (delegates to CDI helper)."""
601 return cdi_has_shared_entities(doc1, doc2)
603 def _has_shared_topics(self, doc1: SearchResult, doc2: SearchResult) -> bool:
604 """Check if documents have shared topics (delegates to CDI helper)."""
605 return cdi_has_shared_topics(doc1, doc2)
607 def _get_shared_topics_count(self, doc1: SearchResult, doc2: SearchResult) -> int:
608 """Get the count of shared topics (delegates to CDI helper)."""
609 return cdi_get_shared_topics_count(doc1, doc2)
611 def _get_shared_entities_count(self, doc1: SearchResult, doc2: SearchResult) -> int:
612 """Get the count of shared entities (delegates to CDI helper)."""
613 return cdi_get_shared_entities_count(doc1, doc2)
615 def _has_different_content_complexity(
616 self, doc1: SearchResult, doc2: SearchResult
617 ) -> bool:
618 """Check if documents have different levels of content complexity."""
619 # Compare word counts if available
620 wc1 = int(getattr(doc1, "word_count", 0) or 0)
621 wc2 = int(getattr(doc2, "word_count", 0) or 0)
623 # Guard against None or non-positive counts to avoid ZeroDivisionError
624 if wc1 > 0 and wc2 > 0:
625 ratio = max(wc1, wc2) / min(wc1, wc2)
626 if ratio > 2.0: # One document is significantly longer
627 return True
629 # Compare content features
630 features1 = (doc1.has_code_blocks, doc1.has_tables, doc1.has_images)
631 features2 = (doc2.has_code_blocks, doc2.has_tables, doc2.has_images)
633 # Different if one has technical content and the other doesn't
634 return features1 != features2
636 def _get_complementary_content_type_score(
637 self, target_doc: SearchResult, candidate_doc: SearchResult
638 ) -> float:
639 """Calculate score based on complementary content types."""
640 score = 0.0
642 # Technical + Business complement
643 technical_keywords = [
644 "api",
645 "code",
646 "implementation",
647 "technical",
648 "development",
649 "architecture",
650 ]
651 business_keywords = [
652 "requirements",
653 "business",
654 "specification",
655 "user",
656 "workflow",
657 "process",
658 ]
660 target_title = target_doc.source_title.lower()
661 candidate_title = candidate_doc.source_title.lower()
663 target_is_technical = any(
664 keyword in target_title for keyword in technical_keywords
665 )
666 target_is_business = any(
667 keyword in target_title for keyword in business_keywords
668 )
669 candidate_is_technical = any(
670 keyword in candidate_title for keyword in technical_keywords
671 )
672 candidate_is_business = any(
673 keyword in candidate_title for keyword in business_keywords
674 )
676 # Technical document + Business document = complementary
677 if (target_is_technical and candidate_is_business) or (
678 target_is_business and candidate_is_technical
679 ):
680 score = max(score, 0.7)
682 # Documentation + Implementation complement
683 if (
684 "documentation" in target_title and "implementation" in candidate_title
685 ) or ("implementation" in target_title and "documentation" in candidate_title):
686 score = max(score, 0.6)
688 # Tutorial + Reference complement
689 tutorial_keywords = [
690 "tutorial",
691 "guide",
692 "how-to",
693 "walkthrough",
694 "quick start",
695 ]
696 reference_keywords = ["reference", "api", "specification", "manual", "docs"]
697 target_is_tutorial = any(k in target_title for k in tutorial_keywords)
698 target_is_reference = any(k in target_title for k in reference_keywords)
699 candidate_is_tutorial = any(k in candidate_title for k in tutorial_keywords)
700 candidate_is_reference = any(k in candidate_title for k in reference_keywords)
701 if (target_is_tutorial and candidate_is_reference) or (
702 target_is_reference and candidate_is_tutorial
703 ):
704 score = max(score, 0.6)
706 # Requirements + Design complement
707 if (
708 "requirements" in target_title
709 and ("design" in candidate_title or "architecture" in candidate_title)
710 ) or (
711 ("design" in target_title or "architecture" in target_title)
712 and "requirements" in candidate_title
713 ):
714 score = max(score, 0.6)
716 return score