Coverage for src/qdrant_loader_mcp_server/search/enhanced/cdi/finders.py: 96%
271 statements
« prev ^ index » next — coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
1"""
2Complementary Content Discovery for Cross-Document Intelligence.
4This module implements advanced complementary content discovery that identifies
5documents which enhance understanding of a target document through
6requirements-implementation chains, abstraction gaps, and cross-functional relationships.
7"""
9from __future__ import annotations
11import time
12from typing import TYPE_CHECKING
14if TYPE_CHECKING:
15 pass
17from ....utils.logging import LoggingConfig
18from ...models import SearchResult
19from .extractors.similarity_helpers import (
20 get_shared_entities_count as cdi_get_shared_entities_count,
21)
22from .extractors.similarity_helpers import (
23 get_shared_technologies_count as cdi_get_shared_technologies_count,
24)
25from .extractors.similarity_helpers import (
26 get_shared_topics_count as cdi_get_shared_topics_count,
27)
28from .extractors.similarity_helpers import (
29 has_reusable_architecture_patterns as cdi_has_reusable_architecture_patterns,
30)
31from .extractors.similarity_helpers import (
32 has_shared_entities as cdi_has_shared_entities,
33)
34from .extractors.similarity_helpers import (
35 has_shared_technologies as cdi_has_shared_technologies,
36)
37from .extractors.similarity_helpers import has_shared_topics as cdi_has_shared_topics
38from .extractors.similarity_helpers import (
39 has_transferable_domain_knowledge as cdi_has_transferable_domain_knowledge,
40)
41from .models import ComplementaryContent
# Module-level logger; instances also keep their own logger (see __init__).
logger = LoggingConfig.get_logger(__name__)
class ComplementaryContentFinder:
    """Finds complementary content that would enhance understanding of a target document."""

    def __init__(
        self,
        similarity_calculator,
        knowledge_graph=None,
    ):
        """Initialize the complementary content finder.

        Args:
            similarity_calculator: Calculator used for document similarity scoring.
            knowledge_graph: Optional knowledge graph used for relationship lookups.
        """
        self.logger = LoggingConfig.get_logger(__name__)
        self.similarity_calculator = similarity_calculator
        self.knowledge_graph = knowledge_graph
59 def find_complementary_content(
60 self,
61 target_doc,
62 candidate_docs,
63 max_recommendations: int = 5,
64 ) -> ComplementaryContent:
65 """Find complementary content for a target document."""
66 start_time = time.time()
68 recommendations = []
69 target_doc_id = f"{target_doc.source_type}:{target_doc.source_title}"
71 self.logger.info(f"Finding complementary content for target: {target_doc_id}")
72 self.logger.info(f"Target doc topics: {target_doc.topics}")
73 self.logger.info(f"Target doc entities: {target_doc.entities}")
74 self.logger.info(f"Analyzing {len(candidate_docs)} candidate documents")
76 for candidate in candidate_docs:
77 candidate_id = f"{candidate.source_type}:{candidate.source_title}"
79 if candidate_id == target_doc_id:
80 continue
82 # Consolidated candidate analysis debug (reduces verbosity)
83 self.logger.debug(
84 "Analyzing candidate",
85 candidate_id=candidate_id,
86 topics_count=len(candidate.topics),
87 entities_count=len(candidate.entities),
88 )
90 # Calculate complementary score
91 complementary_score, reason = self._calculate_complementary_score(
92 target_doc, candidate
93 )
95 self.logger.info(
96 f"Complementary score for {candidate_id}: {complementary_score:.3f} - {reason}"
97 )
99 if (
100 complementary_score > 0.15
101 ): # Lowered threshold for complementary content
102 recommendations.append((candidate_id, complementary_score, reason))
103 else:
104 # Log why it didn't make the cut
105 self.logger.debug(
106 f"Rejected {candidate_id}: score {complementary_score:.3f} below threshold 0.15"
107 )
109 # Sort by complementary score
110 recommendations.sort(key=lambda x: x[1], reverse=True)
112 processing_time = (time.time() - start_time) * 1000
113 self.logger.info(
114 f"Found {len(recommendations)} complementary recommendations in {processing_time:.2f}ms"
115 )
117 return ComplementaryContent(
118 target_doc_id=target_doc_id,
119 recommendations=recommendations[:max_recommendations],
120 recommendation_strategy="mixed",
121 )
123 def _calculate_complementary_score(
124 self, target_doc, candidate_doc
125 ) -> tuple[float, str]:
126 """Calculate how complementary a candidate document is to the target.
128 Redesigned algorithm that prioritizes intra-project relationships while
129 maintaining intelligent inter-project discovery capabilities.
130 """
131 self.logger.info(
132 f"=== Scoring {candidate_doc.source_title} against {target_doc.source_title} ==="
133 )
135 same_project = target_doc.project_id == candidate_doc.project_id
136 self.logger.info(
137 f"Project context: target={target_doc.project_id}, candidate={candidate_doc.project_id}, same_project={same_project}"
138 )
140 if same_project:
141 # Prioritize intra-project relationships
142 score, reason = self._score_intra_project_complementary(
143 target_doc, candidate_doc
144 )
146 # Boost for high topic relevance within project
147 if score > 0 and self._has_high_topic_overlap(target_doc, candidate_doc):
148 boosted_score = min(0.95, score * 1.2)
149 self.logger.info(
150 f"✓ Intra-project topic boost: {score:.3f} → {boosted_score:.3f}"
151 )
152 score = boosted_score
153 reason = f"{reason} (high topic relevance)"
155 else:
156 # Evaluate inter-project relationships
157 score, reason = self._score_inter_project_complementary(
158 target_doc, candidate_doc
159 )
161 # Apply cross-project penalty (inter-project content is less immediately useful)
162 if score > 0:
163 adjusted_score = score * 0.8
164 self.logger.info(
165 f"✓ Inter-project penalty applied: {score:.3f} → {adjusted_score:.3f}"
166 )
167 score = adjusted_score
168 reason = f"Inter-project: {reason}"
170 self.logger.info(
171 f"Final complementary score: {score:.3f} for {candidate_doc.source_title} - {reason}"
172 )
173 return score, reason
175 def _score_intra_project_complementary(
176 self, target_doc, candidate_doc
177 ) -> tuple[float, str]:
178 """Score complementary relationships within the same project."""
179 factors = []
181 # A. Requirements ↔ Implementation Chain
182 if self._is_requirements_implementation_pair(target_doc, candidate_doc):
183 factors.append((0.85, "requirements-implementation"))
184 self.logger.info("✓ Found requirements-implementation pair")
186 # B. Abstraction Level Differences
187 abstraction_gap = self._calculate_abstraction_gap(target_doc, candidate_doc)
188 if abstraction_gap > 0:
189 score = 0.7 + (abstraction_gap * 0.1)
190 factors.append(
191 (score, f"Different abstraction levels (gap: {abstraction_gap})")
192 )
193 self.logger.info(
194 f"✓ Abstraction gap: {abstraction_gap} → score: {score:.3f}"
195 )
197 # C. Cross-Functional Perspectives
198 if self._has_cross_functional_relationship(target_doc, candidate_doc):
199 factors.append((0.75, "Cross-functional perspectives"))
200 self.logger.info("✓ Cross-functional relationship detected")
202 # D. Topic Overlap with Different Document Types
203 if self._has_shared_topics(
204 target_doc, candidate_doc
205 ) and self._has_different_document_types(target_doc, candidate_doc):
206 shared_topics = self._get_shared_topics_count(target_doc, candidate_doc)
207 score = min(0.65, 0.35 + (shared_topics * 0.1))
208 factors.append(
209 (
210 score,
211 f"Same topics, different document types ({shared_topics} topics)",
212 )
213 )
214 self.logger.info(f"✓ Topic overlap with different doc types: {score:.3f}")
216 return self._calculate_weighted_score(factors, target_doc, candidate_doc)
218 def _score_inter_project_complementary(
219 self, target_doc, candidate_doc
220 ) -> tuple[float, str]:
221 """Score complementary relationships between different projects."""
222 factors = []
224 # A. Similar Challenges/Solutions
225 if self._has_similar_challenges(target_doc, candidate_doc):
226 factors.append((0.8, "Similar challenges/solutions"))
227 self.logger.info("✓ Similar challenges detected")
229 # B. Domain Expertise Transfer
230 if self._has_transferable_domain_knowledge(target_doc, candidate_doc):
231 factors.append((0.75, "Transferable domain knowledge"))
232 self.logger.info("✓ Transferable domain knowledge")
234 # C. Architectural Patterns
235 if self._has_reusable_architecture_patterns(target_doc, candidate_doc):
236 factors.append((0.7, "Reusable architecture patterns"))
237 self.logger.info("✓ Architecture patterns detected")
239 # D. Shared Technologies/Standards
240 if self._has_shared_technologies(target_doc, candidate_doc):
241 shared_count = self._get_shared_technologies_count(
242 target_doc, candidate_doc
243 )
244 score = min(0.6, 0.3 + (shared_count * 0.1))
245 factors.append((score, f"Shared technologies ({shared_count} common)"))
246 self.logger.info(f"✓ Shared technologies: {score:.3f}")
248 return self._calculate_weighted_score(factors, target_doc, candidate_doc)
250 def _calculate_weighted_score(
251 self,
252 factors: list[tuple[float, str]],
253 target_doc=None,
254 candidate_doc=None,
255 ) -> tuple[float, str]:
256 """Calculate weighted score from multiple factors."""
257 if not factors:
258 if target_doc and candidate_doc:
259 return self._enhanced_fallback_scoring(target_doc, candidate_doc)
260 else:
261 return 0.0, "No complementary relationship found"
263 # Sort factors by score but give priority to requirements-implementation relationships
264 factors.sort(key=lambda x: x[0], reverse=True)
266 # Check for high-priority relationships first
267 for score, reason in factors:
268 if "requirements-implementation" in reason.lower():
269 # Requirements-implementation pairs get priority
270 if len(factors) > 1:
271 secondary_boost = sum(s for s, r in factors if r != reason) * 0.1
272 final_score = min(0.95, score + secondary_boost)
273 primary_reason = f"{reason} (+{len(factors)-1} other factors)"
274 else:
275 final_score = score
276 primary_reason = reason
277 return final_score, primary_reason
279 # Use the highest scoring factor as primary
280 primary_score, primary_reason = factors[0]
282 # Boost if multiple factors contribute
283 if len(factors) > 1:
284 secondary_boost = sum(score for score, _ in factors[1:]) * 0.1
285 final_score = min(0.95, primary_score + secondary_boost)
286 primary_reason = f"{primary_reason} (+{len(factors)-1} other factors)"
287 else:
288 final_score = primary_score
290 return final_score, primary_reason
292 def _is_requirements_implementation_pair(self, doc1, doc2) -> bool:
293 """Detect if documents form a requirements -> implementation chain."""
294 req_keywords = [
295 "requirements",
296 "specification",
297 "user story",
298 "feature",
299 "functional",
300 ]
301 impl_keywords = [
302 "implementation",
303 "technical",
304 "architecture",
305 "api",
306 "code",
307 "development",
308 ]
310 title1 = doc1.source_title.lower()
311 title2 = doc2.source_title.lower()
313 doc1_is_req = any(keyword in title1 for keyword in req_keywords)
314 doc1_is_impl = any(keyword in title1 for keyword in impl_keywords)
315 doc2_is_req = any(keyword in title2 for keyword in req_keywords)
316 doc2_is_impl = any(keyword in title2 for keyword in impl_keywords)
318 # One is requirements, other is implementation
319 is_req_impl_pair = (doc1_is_req and doc2_is_impl) or (
320 doc1_is_impl and doc2_is_req
321 )
323 if not is_req_impl_pair:
324 return False
326 # For same-project documents, we don't require shared topics/entities
327 # as the project context already provides relationship
328 same_project = (
329 getattr(doc1, "project_id", None) == getattr(doc2, "project_id", None)
330 and getattr(doc1, "project_id", None) is not None
331 )
333 if same_project:
334 return True
336 # For different projects, require some shared context
337 return self._has_shared_topics(doc1, doc2) or self._has_shared_entities(
338 doc1, doc2
339 )
341 def _calculate_abstraction_gap(self, doc1: SearchResult, doc2: SearchResult) -> int:
342 """Calculate difference in abstraction levels (0-3).
343 0: Same level, 3: Maximum gap (e.g., epic vs implementation detail)
344 """
345 level1 = self._get_abstraction_level(doc1)
346 level2 = self._get_abstraction_level(doc2)
347 return abs(level1 - level2)
349 def _get_abstraction_level(self, doc: SearchResult) -> int:
350 """Determine abstraction level of document (0=highest, 3=lowest)."""
351 title = doc.source_title.lower()
353 # Level 0: High-level business/strategy
354 if any(
355 keyword in title
356 for keyword in [
357 "strategy",
358 "vision",
359 "overview",
360 "executive",
361 "business case",
362 ]
363 ):
364 return 0
366 # Level 1: Requirements/features
367 if any(
368 keyword in title
369 for keyword in [
370 "requirements",
371 "features",
372 "user story",
373 "epic",
374 "specification",
375 ]
376 ):
377 return 1
379 # Level 2: Design/architecture
380 if any(
381 keyword in title
382 for keyword in [
383 "design",
384 "architecture",
385 "workflow",
386 "process",
387 "wireframe",
388 ]
389 ):
390 return 2
392 # Level 3: Implementation details
393 if any(
394 keyword in title
395 for keyword in [
396 "implementation",
397 "code",
398 "api",
399 "technical",
400 "development",
401 "configuration",
402 ]
403 ):
404 return 3
406 # Default to middle level
407 return 2
409 def _has_cross_functional_relationship(
410 self, doc1: SearchResult, doc2: SearchResult
411 ) -> bool:
412 """Detect business + technical, feature + security, etc."""
413 business_keywords = [
414 "business",
415 "user",
416 "requirements",
417 "workflow",
418 "process",
419 "feature",
420 ]
421 technical_keywords = [
422 "technical",
423 "architecture",
424 "api",
425 "implementation",
426 "code",
427 "development",
428 ]
429 security_keywords = [
430 "security",
431 "authentication",
432 "authorization",
433 "compliance",
434 "audit",
435 ]
437 title1 = doc1.source_title.lower()
438 title2 = doc2.source_title.lower()
440 # Business + Technical
441 if (
442 any(k in title1 for k in business_keywords)
443 and any(k in title2 for k in technical_keywords)
444 ) or (
445 any(k in title2 for k in business_keywords)
446 and any(k in title1 for k in technical_keywords)
447 ):
448 return True
450 # Feature + Security
451 if (
452 any(k in title1 for k in ["feature", "functionality"])
453 and any(k in title2 for k in security_keywords)
454 ) or (
455 any(k in title2 for k in ["feature", "functionality"])
456 and any(k in title1 for k in security_keywords)
457 ):
458 return True
460 return False
462 def _has_different_document_types(self, doc1, doc2) -> bool:
463 """Check if documents are of different types based on content and title."""
464 type1 = self._classify_document_type(doc1)
465 type2 = self._classify_document_type(doc2)
466 return type1 != type2
468 def _classify_document_type(self, doc) -> str:
469 """Classify document as: user_story, technical_spec, architecture, compliance, testing, etc."""
470 title = doc.source_title.lower()
472 # Check more specific categories first to avoid conflicts
473 if any(
474 keyword in title
475 for keyword in ["security", "compliance", "audit", "policy"]
476 ):
477 return "compliance"
478 elif any(keyword in title for keyword in ["test", "testing", "qa", "quality"]):
479 return "testing"
480 elif any(keyword in title for keyword in ["tutorial", "how-to", "walkthrough"]):
481 return "tutorial"
482 elif any(keyword in title for keyword in ["reference", "manual"]):
483 return "reference"
484 elif any(keyword in title for keyword in ["example", "sample", "demo"]):
485 return "example"
486 elif any(keyword in title for keyword in ["user story", "epic", "feature"]):
487 return "user_story"
488 elif any(
489 keyword in title
490 for keyword in ["technical", "specification", "api", "implementation"]
491 ):
492 return "technical_spec"
493 elif any(keyword in title for keyword in ["architecture", "design", "system"]):
494 return "architecture"
495 elif any(
496 keyword in title
497 for keyword in ["workflow", "process", "procedure", "guide"]
498 ):
499 return "process"
500 elif any(
501 keyword in title for keyword in ["requirement"]
502 ): # More general, check last
503 return "user_story"
504 else:
505 return "general"
507 def _has_high_topic_overlap(self, doc1: SearchResult, doc2: SearchResult) -> bool:
508 """Check if documents have high topic overlap (>= 3 shared topics)."""
509 return self._get_shared_topics_count(doc1, doc2) >= 3
511 def _has_similar_challenges(self, doc1: SearchResult, doc2: SearchResult) -> bool:
512 """Identify common challenge patterns (auth, scalability, compliance)."""
513 challenge_patterns = [
514 ["authentication", "login", "auth", "signin"],
515 ["scalability", "performance", "optimization", "scale"],
516 ["compliance", "regulation", "audit", "governance"],
517 ["integration", "api", "interface", "connection"],
518 ["security", "privacy", "protection", "safety"],
519 ["migration", "upgrade", "transition", "conversion"],
520 ]
522 title1 = doc1.source_title.lower()
523 title2 = doc2.source_title.lower()
525 for pattern in challenge_patterns:
526 if any(keyword in title1 for keyword in pattern) and any(
527 keyword in title2 for keyword in pattern
528 ):
529 return True
531 return False
533 def _has_transferable_domain_knowledge(
534 self, doc1: SearchResult, doc2: SearchResult
535 ) -> bool:
536 """Check for transferable domain expertise between projects (delegates to CDI helper)."""
537 return cdi_has_transferable_domain_knowledge(doc1, doc2)
539 def _has_reusable_architecture_patterns(
540 self, doc1: SearchResult, doc2: SearchResult
541 ) -> bool:
542 """Identify reusable architecture patterns (delegates to CDI helper)."""
543 return cdi_has_reusable_architecture_patterns(doc1, doc2)
545 def _has_shared_technologies(self, doc1: SearchResult, doc2: SearchResult) -> bool:
546 """Identify shared technologies, frameworks, standards (delegates to CDI helper)."""
547 return cdi_has_shared_technologies(doc1, doc2)
549 def _get_shared_technologies_count(
550 self, doc1: SearchResult, doc2: SearchResult
551 ) -> int:
552 """Count shared technologies between documents (delegates to CDI helper)."""
553 return cdi_get_shared_technologies_count(doc1, doc2)
555 def _enhanced_fallback_scoring(
556 self, target_doc, candidate_doc
557 ) -> tuple[float, str]:
558 """Enhanced fallback when advanced algorithms don't apply."""
559 fallback_score = self._calculate_fallback_score(target_doc, candidate_doc)
560 if fallback_score > 0:
561 return fallback_score, "Basic content similarity"
562 else:
563 return 0.0, "No complementary relationship found"
565 def _calculate_fallback_score(
566 self, target_doc: SearchResult, candidate_doc: SearchResult
567 ) -> float:
568 """Fallback scoring for when advanced methods don't find relationships."""
569 score = 0.0
571 # Just having any shared topics at all
572 if self._has_shared_topics(target_doc, candidate_doc):
573 shared_count = self._get_shared_topics_count(target_doc, candidate_doc)
574 score = max(score, 0.2 + (shared_count * 0.05))
575 self.logger.debug(
576 f"Fallback: {shared_count} shared topics → score: {score:.3f}"
577 )
579 # Just having any shared entities at all
580 if self._has_shared_entities(target_doc, candidate_doc):
581 shared_count = self._get_shared_entities_count(target_doc, candidate_doc)
582 score = max(score, 0.15 + (shared_count * 0.05))
583 self.logger.debug(
584 f"Fallback: {shared_count} shared entities → score: {score:.3f}"
585 )
587 # Simple keyword overlap in titles
588 target_words = set(target_doc.source_title.lower().split())
589 candidate_words = set(candidate_doc.source_title.lower().split())
590 common_words = target_words & candidate_words
591 if len(common_words) > 1: # More than just common words like "the", "and"
592 score = max(score, 0.1 + (len(common_words) * 0.02))
593 self.logger.debug(
594 f"Fallback: {len(common_words)} common words in titles → score: {score:.3f}"
595 )
597 return min(score, 0.5) # Cap fallback scores
599 def _has_shared_entities(self, doc1: SearchResult, doc2: SearchResult) -> bool:
600 """Check if documents have shared entities (delegates to CDI helper)."""
601 return cdi_has_shared_entities(doc1, doc2)
603 def _has_shared_topics(self, doc1: SearchResult, doc2: SearchResult) -> bool:
604 """Check if documents have shared topics (delegates to CDI helper)."""
605 return cdi_has_shared_topics(doc1, doc2)
607 def _get_shared_topics_count(self, doc1: SearchResult, doc2: SearchResult) -> int:
608 """Get the count of shared topics (delegates to CDI helper)."""
609 return cdi_get_shared_topics_count(doc1, doc2)
611 def _get_shared_entities_count(self, doc1: SearchResult, doc2: SearchResult) -> int:
612 """Get the count of shared entities (delegates to CDI helper)."""
613 return cdi_get_shared_entities_count(doc1, doc2)
615 def _has_different_content_complexity(
616 self, doc1: SearchResult, doc2: SearchResult
617 ) -> bool:
618 """Check if documents have different levels of content complexity."""
619 # Compare word counts if available
620 wc1 = int(getattr(doc1, "word_count", 0) or 0)
621 wc2 = int(getattr(doc2, "word_count", 0) or 0)
623 # Guard against None or non-positive counts to avoid ZeroDivisionError
624 if wc1 > 0 and wc2 > 0:
625 ratio = max(wc1, wc2) / min(wc1, wc2)
626 if ratio > 2.0: # One document is significantly longer
627 return True
629 # Compare content features
630 features1 = (doc1.has_code_blocks, doc1.has_tables, doc1.has_images)
631 features2 = (doc2.has_code_blocks, doc2.has_tables, doc2.has_images)
633 # Different if one has technical content and the other doesn't
634 return features1 != features2
636 def _get_complementary_content_type_score(
637 self, target_doc: SearchResult, candidate_doc: SearchResult
638 ) -> float:
639 """Calculate score based on complementary content types."""
640 score = 0.0
642 # Technical + Business complement
643 technical_keywords = [
644 "api",
645 "code",
646 "implementation",
647 "technical",
648 "development",
649 "architecture",
650 ]
651 business_keywords = [
652 "requirements",
653 "business",
654 "specification",
655 "user",
656 "workflow",
657 "process",
658 ]
660 target_title = target_doc.source_title.lower()
661 candidate_title = candidate_doc.source_title.lower()
663 target_is_technical = any(
664 keyword in target_title for keyword in technical_keywords
665 )
666 target_is_business = any(
667 keyword in target_title for keyword in business_keywords
668 )
669 candidate_is_technical = any(
670 keyword in candidate_title for keyword in technical_keywords
671 )
672 candidate_is_business = any(
673 keyword in candidate_title for keyword in business_keywords
674 )
676 # Technical document + Business document = complementary
677 if (target_is_technical and candidate_is_business) or (
678 target_is_business and candidate_is_technical
679 ):
680 score = max(score, 0.7)
682 # Documentation + Implementation complement
683 if (
684 "documentation" in target_title and "implementation" in candidate_title
685 ) or ("implementation" in target_title and "documentation" in candidate_title):
686 score = max(score, 0.6)
688 # Tutorial + Reference complement
689 tutorial_keywords = [
690 "tutorial",
691 "guide",
692 "how-to",
693 "walkthrough",
694 "quick start",
695 ]
696 reference_keywords = ["reference", "api", "specification", "manual", "docs"]
697 target_is_tutorial = any(k in target_title for k in tutorial_keywords)
698 target_is_reference = any(k in target_title for k in reference_keywords)
699 candidate_is_tutorial = any(k in candidate_title for k in tutorial_keywords)
700 candidate_is_reference = any(k in candidate_title for k in reference_keywords)
701 if (target_is_tutorial and candidate_is_reference) or (
702 target_is_reference and candidate_is_tutorial
703 ):
704 score = max(score, 0.6)
706 # Requirements + Design complement
707 if (
708 "requirements" in target_title
709 and ("design" in candidate_title or "architecture" in candidate_title)
710 ) or (
711 ("design" in target_title or "architecture" in target_title)
712 and "requirements" in candidate_title
713 ):
714 score = max(score, 0.6)
716 return score