Coverage for src/qdrant_loader/core/chunking/strategy/html/html_chunk_processor.py

1"""HTML-specific chunk processor for creating HTML documents with enhanced metadata."""

3from typing import Any

5from qdrant_loader.config import Settings

6from qdrant_loader.core.chunking.strategy.base.chunk_processor import BaseChunkProcessor

7from qdrant_loader.core.document import Document

9from .html_metadata_extractor import HTMLMetadataExtractor

class HTMLChunkProcessor(BaseChunkProcessor):
    """Chunk processor for HTML documents with semantic and accessibility analysis.

    Builds chunk ``Document`` objects from pieces of an HTML document, enriches
    their metadata through ``HTMLMetadataExtractor`` and decides, per chunk,
    whether entity extraction (NLP) should run or be skipped.
    """

    # Thresholds shared by every skip heuristic. They live in one place so the
    # skip decision and the reported skip reason can never disagree.
    _MIN_NLP_CONTENT_LENGTH = 50  # stripped characters
    _MIN_TEXT_RATIO = 0.3  # len(text_content) / len(content)
    _MAX_MARKUP_RATIO = 0.8  # markup share of the chunk
    _MIN_ACCESSIBILITY_SCORE = 0.2
    _NON_SEMANTIC_SECTIONS = ("nav", "aside", "footer")

    def __init__(self, settings: Settings):
        """Initialize the HTML chunk processor.

        Args:
            settings: Application settings. The HTML strategy configuration is
                read from ``settings.global_config.chunking.strategies.html``.
        """
        super().__init__(settings)

        # HTML-specific metadata extractor (hierarchy, entities, titles).
        self.metadata_extractor = HTMLMetadataExtractor()

        # HTML strategy configuration and the NLP size limit derived from it.
        self.html_config = settings.global_config.chunking.strategies.html
        self.max_chunk_size_for_nlp = self.html_config.max_chunk_size_for_nlp

    def create_chunk_document(
        self,
        original_doc: Document,
        chunk_content: str,
        chunk_metadata: dict[str, Any],
        chunk_index: int,
        total_chunks: int,
        skip_nlp: bool = False,
    ) -> Document:
        """Create an HTML chunk document with enhanced metadata.

        Args:
            original_doc: Document the chunk was cut from; its id, source,
                source type, url, content type and title are carried over.
            chunk_content: Content of this chunk.
            chunk_metadata: Metadata gathered for the chunk so far; passed to
                the metadata extractor for enrichment.
            chunk_index: Zero-based position of the chunk in the document.
            total_chunks: Number of chunks the document was split into.
            skip_nlp: Force entity extraction to be skipped even when no
                heuristic asks for it.

        Returns:
            A new ``Document`` for the chunk. Its metadata always contains
            ``entities`` and ``nlp_skipped``; ``skip_reason`` is added only
            when NLP was skipped.
        """
        chunk_id = Document.generate_chunk_id(original_doc.id, chunk_index)

        # HTML-specific hierarchical metadata, then the chunk bookkeeping keys.
        enriched_metadata = self.metadata_extractor.extract_hierarchical_metadata(
            chunk_content, chunk_metadata, original_doc
        )
        enriched_metadata.update(
            {
                "chunk_index": chunk_index,
                "total_chunks": total_chunks,
                "parent_document_id": original_doc.id,
                "chunking_strategy": "html_modular",
                "chunk_size": len(chunk_content),
            }
        )

        # An explicit request wins; otherwise the heuristics decide.
        should_skip_nlp = skip_nlp or self.should_skip_semantic_analysis(
            chunk_content, enriched_metadata
        )

        if should_skip_nlp:
            enriched_metadata["entities"] = []
            enriched_metadata["nlp_skipped"] = True
            # Yields "unknown" when the caller forced the skip via ``skip_nlp``
            # and none of the heuristics matches.
            enriched_metadata["skip_reason"] = self._determine_skip_reason(
                chunk_content, enriched_metadata
            )
        else:
            enriched_metadata["entities"] = self.metadata_extractor.extract_entities(
                chunk_content
            )
            enriched_metadata["nlp_skipped"] = False

        return Document(
            id=chunk_id,
            content=chunk_content,
            metadata=enriched_metadata,
            source=original_doc.source,
            source_type=original_doc.source_type,
            url=original_doc.url,
            content_type=original_doc.content_type,
            title=self._generate_chunk_title(chunk_content, chunk_index, original_doc),
        )

    def should_skip_semantic_analysis(
        self, content: str, chunk_metadata: dict[str, Any]
    ) -> bool:
        """Determine if semantic analysis should be skipped for HTML content.

        Args:
            content: Chunk content.
            chunk_metadata: Chunk metadata; the keys ``text_content``,
                ``section_type`` and ``accessibility_score`` are consulted
                when present.

        Returns:
            ``True`` when any skip heuristic matches, ``False`` otherwise.
        """
        return self._find_skip_reason(content, chunk_metadata) is not None

    def _generate_chunk_title(
        self, content: str, chunk_index: int, original_doc: Document
    ) -> str:
        """Generate a descriptive title for the HTML chunk.

        Preference order: the section title extracted from the chunk, then the
        original document title, then a generic label. Chunk numbers in the
        title are one-based.
        """
        try:
            section_title = (
                self.metadata_extractor.document_parser.extract_section_title(content)
            )

            # "Untitled Section" is treated as "no title found".
            if section_title and section_title != "Untitled Section":
                return f"{section_title} (Chunk {chunk_index + 1})"

            if original_doc.title:
                return f"{original_doc.title} - Chunk {chunk_index + 1}"

            return f"HTML Content - Chunk {chunk_index + 1}"

        except Exception:
            # Deliberately best-effort: a failure while deriving a title must
            # not prevent the chunk from being created.
            return f"HTML Content - Chunk {chunk_index + 1}"

    def _determine_skip_reason(
        self, content: str, chunk_metadata: dict[str, Any]
    ) -> str:
        """Determine the specific reason why NLP was skipped.

        Returns:
            The label of the first matching heuristic, or ``"unknown"`` when
            none matches.
        """
        reason = self._find_skip_reason(content, chunk_metadata)
        return "unknown" if reason is None else reason

    def _find_skip_reason(
        self, content: str, chunk_metadata: dict[str, Any]
    ) -> "str | None":
        """Return the label of the first skip heuristic that matches, else None.

        Single source of truth for ``should_skip_semantic_analysis`` and
        ``_determine_skip_reason``. The checks run in a fixed order and the
        first match wins.
        """
        content_length = len(content)

        # Very large chunks: skip to prevent performance issues.
        if content_length > self.max_chunk_size_for_nlp:
            return f"content_too_large ({content_length} > {self.max_chunk_size_for_nlp})"

        # Very small chunks are likely not meaningful.
        if len(content.strip()) < self._MIN_NLP_CONTENT_LENGTH:
            return "content_too_small"

        # Mostly markup: less than 30% of the chunk is actual text. Only
        # evaluated when the metadata carries a non-empty ``text_content``.
        text_content = chunk_metadata.get("text_content", "")
        if text_content and len(text_content) < content_length * self._MIN_TEXT_RATIO:
            return "low_text_ratio"

        # Navigation, sidebars and footers are usually not meaningful for NLP.
        section_type = chunk_metadata.get("section_type", "")
        if section_type in self._NON_SEMANTIC_SECTIONS:
            return f"non_semantic_section ({section_type})"

        # Content that is primarily code or script blocks.
        if section_type == "code_block":
            return "code_content"

        # More than 80% markup.
        # NOTE(review): with an empty or missing ``text_content`` the ratio is
        # 1.0, so such chunks are always skipped here - confirm this is the
        # intended treatment of chunks without extracted text.
        markup_ratio = self._calculate_markup_ratio(content, text_content)
        if markup_ratio > self._MAX_MARKUP_RATIO:
            return f"high_markup_ratio ({markup_ratio:.2f})"

        # A very low accessibility score might indicate poor content. A value
        # of None is treated as "no score" rather than raising on comparison.
        accessibility_score = chunk_metadata.get("accessibility_score", 1.0)
        if (
            accessibility_score is not None
            and accessibility_score < self._MIN_ACCESSIBILITY_SCORE
        ):
            return f"low_accessibility_score ({accessibility_score:.2f})"

        return None

    def _calculate_markup_ratio(self, content: str, text_content: str) -> float:
        """Calculate the ratio of markup to text content.

        Returns:
            ``0.0`` for empty ``content``, ``1.0`` when ``text_content`` is
            empty (all markup, no text), otherwise
            ``(len(content) - len(text_content)) / len(content)``.
        """
        if not content:
            return 0.0

        if not text_content:
            return 1.0  # All markup, no text

        # ``content`` is non-empty here, so the division is safe.
        return (len(content) - len(text_content)) / len(content)

Coverage for src/qdrant_loader/core/chunking/strategy/html/html_chunk_processor.py: 76%

83 statements