Coverage for src/qdrant_loader/core/chunking/strategy/html/html_chunk_processor.py: 76%

83 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-13 09:19 +0000

1"""HTML-specific chunk processor for creating HTML documents with enhanced metadata.""" 

2 

3from typing import Any 

4 

5from qdrant_loader.config import Settings 

6from qdrant_loader.core.chunking.strategy.base.chunk_processor import BaseChunkProcessor 

7from qdrant_loader.core.document import Document 

8 

9from .html_metadata_extractor import HTMLMetadataExtractor 

10 

11 

12class HTMLChunkProcessor(BaseChunkProcessor): 

13 """Chunk processor for HTML documents with semantic and accessibility analysis.""" 

14 

15 def __init__(self, settings: Settings): 

16 """Initialize the HTML chunk processor.""" 

17 super().__init__(settings) 

18 

19 # Initialize HTML-specific metadata extractor 

20 self.metadata_extractor = HTMLMetadataExtractor() 

21 

22 # Get HTML strategy configuration 

23 self.html_config = settings.global_config.chunking.strategies.html 

24 self.max_chunk_size_for_nlp = self.html_config.max_chunk_size_for_nlp 

25 

26 def create_chunk_document( 

27 self, 

28 original_doc: Document, 

29 chunk_content: str, 

30 chunk_metadata: dict[str, Any], 

31 chunk_index: int, 

32 total_chunks: int, 

33 skip_nlp: bool = False, 

34 ) -> Document: 

35 """Create an HTML chunk document with enhanced metadata.""" 

36 

37 # Generate unique chunk ID 

38 chunk_id = Document.generate_chunk_id(original_doc.id, chunk_index) 

39 

40 # Extract HTML-specific hierarchical metadata 

41 enriched_metadata = self.metadata_extractor.extract_hierarchical_metadata( 

42 chunk_content, chunk_metadata, original_doc 

43 ) 

44 

45 # Add chunk-specific metadata 

46 enriched_metadata.update( 

47 { 

48 "chunk_index": chunk_index, 

49 "total_chunks": total_chunks, 

50 "parent_document_id": original_doc.id, 

51 "chunking_strategy": "html_modular", 

52 "chunk_size": len(chunk_content), 

53 } 

54 ) 

55 

56 # Determine if we should skip NLP processing 

57 should_skip_nlp = skip_nlp or self.should_skip_semantic_analysis( 

58 chunk_content, enriched_metadata 

59 ) 

60 

61 # Extract entities if NLP is enabled 

62 entities = [] 

63 if not should_skip_nlp: 

64 entities = self.metadata_extractor.extract_entities(chunk_content) 

65 enriched_metadata["entities"] = entities 

66 enriched_metadata["nlp_skipped"] = False 

67 else: 

68 enriched_metadata["entities"] = [] 

69 enriched_metadata["nlp_skipped"] = True 

70 enriched_metadata["skip_reason"] = self._determine_skip_reason( 

71 chunk_content, enriched_metadata 

72 ) 

73 

74 # Create the chunk document 

75 chunk_doc = Document( 

76 id=chunk_id, 

77 content=chunk_content, 

78 metadata=enriched_metadata, 

79 source=original_doc.source, 

80 source_type=original_doc.source_type, 

81 url=original_doc.url, 

82 content_type=original_doc.content_type, 

83 title=self._generate_chunk_title(chunk_content, chunk_index, original_doc), 

84 ) 

85 

86 return chunk_doc 

87 

88 def should_skip_semantic_analysis( 

89 self, content: str, chunk_metadata: dict[str, Any] 

90 ) -> bool: 

91 """Determine if semantic analysis should be skipped for HTML content.""" 

92 

93 # Skip for very large chunks to prevent performance issues 

94 if len(content) > self.max_chunk_size_for_nlp: 

95 return True 

96 

97 # Skip for very small chunks (likely not meaningful) 

98 if len(content.strip()) < 50: 

99 return True 

100 

101 # Check if content is mostly HTML markup without substantial text 

102 text_content = chunk_metadata.get("text_content", "") 

103 if text_content and len(text_content) < len(content) * 0.3: 

104 # Less than 30% is actual text content 

105 return True 

106 

107 # Skip for certain HTML section types that are typically non-semantic 

108 section_type = chunk_metadata.get("section_type", "") 

109 if section_type in ["nav", "aside", "footer"]: 

110 # Navigation, sidebars, and footers are usually not meaningful for NLP 

111 return True 

112 

113 # Skip if content is primarily code or script blocks 

114 if section_type == "code_block": 

115 return True 

116 

117 # Skip if content has very high markup-to-text ratio 

118 markup_ratio = self._calculate_markup_ratio(content, text_content) 

119 if markup_ratio > 0.8: # More than 80% markup 

120 return True 

121 

122 # Skip if accessibility score is very low (might indicate poor content) 

123 accessibility_score = chunk_metadata.get("accessibility_score", 1.0) 

124 if accessibility_score < 0.2: 

125 return True 

126 

127 return False 

128 

129 def _generate_chunk_title( 

130 self, content: str, chunk_index: int, original_doc: Document 

131 ) -> str: 

132 """Generate a descriptive title for the HTML chunk.""" 

133 try: 

134 # Try to extract title from HTML content using metadata extractor 

135 section_title = ( 

136 self.metadata_extractor.document_parser.extract_section_title(content) 

137 ) 

138 

139 if section_title and section_title != "Untitled Section": 

140 return f"{section_title} (Chunk {chunk_index + 1})" 

141 

142 # Fallback to original document title with chunk number 

143 if original_doc.title: 

144 return f"{original_doc.title} - Chunk {chunk_index + 1}" 

145 

146 # Ultimate fallback 

147 return f"HTML Content - Chunk {chunk_index + 1}" 

148 

149 except Exception: 

150 return f"HTML Content - Chunk {chunk_index + 1}" 

151 

152 def _determine_skip_reason( 

153 self, content: str, chunk_metadata: dict[str, Any] 

154 ) -> str: 

155 """Determine the specific reason why NLP was skipped.""" 

156 

157 if len(content) > self.max_chunk_size_for_nlp: 

158 return f"content_too_large ({len(content)} > {self.max_chunk_size_for_nlp})" 

159 

160 if len(content.strip()) < 50: 

161 return "content_too_small" 

162 

163 text_content = chunk_metadata.get("text_content", "") 

164 if text_content and len(text_content) < len(content) * 0.3: 

165 return "low_text_ratio" 

166 

167 section_type = chunk_metadata.get("section_type", "") 

168 if section_type in ["nav", "aside", "footer"]: 

169 return f"non_semantic_section ({section_type})" 

170 

171 if section_type == "code_block": 

172 return "code_content" 

173 

174 markup_ratio = self._calculate_markup_ratio(content, text_content) 

175 if markup_ratio > 0.8: 

176 return f"high_markup_ratio ({markup_ratio:.2f})" 

177 

178 accessibility_score = chunk_metadata.get("accessibility_score", 1.0) 

179 if accessibility_score < 0.2: 

180 return f"low_accessibility_score ({accessibility_score:.2f})" 

181 

182 return "unknown" 

183 

184 def _calculate_markup_ratio(self, content: str, text_content: str) -> float: 

185 """Calculate the ratio of markup to text content.""" 

186 if not content: 

187 return 0.0 

188 

189 if not text_content: 

190 return 1.0 # All markup, no text 

191 

192 markup_length = len(content) - len(text_content) 

193 return markup_length / len(content) if len(content) > 0 else 0.0