Coverage for src/qdrant_loader/core/text_processing/semantic_analyzer.py: 98%

109 statements  

coverage.py v7.8.2, created at 2025-06-04 05:50 +0000

1"""Semantic analysis module for text processing.""" 

2 

3import logging 

4from dataclasses import dataclass 

5from typing import Any 

6 

7import spacy 

8from gensim import corpora 

9from gensim.models import LdaModel 

10from gensim.parsing.preprocessing import preprocess_string 

11from spacy.cli.download import download as spacy_download 

12from spacy.tokens import Doc 

13 

14logger = logging.getLogger(__name__) 

15 

16 

17@dataclass 

18class SemanticAnalysisResult: 

19 """Container for semantic analysis results.""" 

20 

21 entities: list[dict[str, Any]] 

22 pos_tags: list[dict[str, Any]] 

23 dependencies: list[dict[str, Any]] 

24 topics: list[dict[str, Any]] 

25 key_phrases: list[str] 

26 document_similarity: dict[str, float] 

27 

28 

class SemanticAnalyzer:
    """Advanced semantic analysis for text processing."""

    def __init__(
        self,
        spacy_model: str = "en_core_web_sm",
        num_topics: int = 5,
        passes: int = 10,
        min_topic_freq: int = 2,
    ):
        """Initialize the semantic analyzer.

        Args:
            spacy_model: Name of the spaCy model to use
            num_topics: Number of topics for LDA
            passes: Number of passes for LDA training
            min_topic_freq: Minimum frequency for topic terms
        """
        self.logger = logging.getLogger(__name__)

        # Initialize spaCy, downloading the model on first use if missing
        try:
            self.nlp = spacy.load(spacy_model)
        except OSError:
            self.logger.info(f"Downloading spaCy model {spacy_model}...")
            spacy_download(spacy_model)
            self.nlp = spacy.load(spacy_model)

        # Initialize LDA parameters
        self.num_topics = num_topics
        self.passes = passes
        self.min_topic_freq = min_topic_freq

        # Initialize LDA model
        self.lda_model = None
        self.dictionary = None

        # Cache for processed documents
        self._doc_cache = {}

    def analyze_text(
        self, text: str, doc_id: str | None = None
    ) -> SemanticAnalysisResult:
        """Perform comprehensive semantic analysis on text.

        Args:
            text: Text to analyze
            doc_id: Optional document ID for caching

        Returns:
            SemanticAnalysisResult containing all analysis results
        """
        # Check cache
        if doc_id and doc_id in self._doc_cache:
            return self._doc_cache[doc_id]

        # Process with spaCy
        doc = self.nlp(text)

        # Extract entities with linking
        entities = self._extract_entities(doc)

        # Get part-of-speech tags
        pos_tags = self._get_pos_tags(doc)

        # Get dependency parse
        dependencies = self._get_dependencies(doc)

        # Extract topics
        topics = self._extract_topics(text)

        # Extract key phrases
        key_phrases = self._extract_key_phrases(doc)

        # Calculate document similarity
        doc_similarity = self._calculate_document_similarity(text)

        # Create result
        result = SemanticAnalysisResult(
            entities=entities,
            pos_tags=pos_tags,
            dependencies=dependencies,
            topics=topics,
            key_phrases=key_phrases,
            document_similarity=doc_similarity,
        )

        # Cache result
        if doc_id:
            self._doc_cache[doc_id] = result

        return result
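
    # Hypothetical example of what a call yields (the sentence, doc_id, and
    # values shown are illustrative only; a runnable sketch is at the bottom
    # of this module):
    #
    #   result = analyzer.analyze_text("Apple hired Jane Doe.", doc_id="doc-1")
    #   result.entities[0]["text"]   # "Apple"
    #   result.entities[0]["label"]  # "ORG" (typical en_core_web_sm output)
    #   result.key_phrases           # deduplicated multi-word noun chunks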

    def _extract_entities(self, doc: Doc) -> list[dict[str, Any]]:
        """Extract named entities with linking.

        Args:
            doc: spaCy document

        Returns:
            List of entity dictionaries with linking information
        """
        entities = []
        for ent in doc.ents:
            # Get entity context (the full sentence containing the entity)
            context = ent.sent.text

            # Get a human-readable description of the entity label;
            # spacy.explain() returns None for labels it does not know,
            # so fall back to the raw label
            description = spacy.explain(ent.label_) or ent.label_

            # Get related entities from the same sentence
            related = []
            for token in ent.sent:
                if token.ent_type_ and token.text != ent.text:
                    related.append(
                        {
                            "text": token.text,
                            "type": token.ent_type_,
                            "relation": token.dep_,
                        }
                    )

            entities.append(
                {
                    "text": ent.text,
                    "label": ent.label_,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "description": description,
                    "context": context,
                    "related_entities": related,
                }
            )

        return entities

    def _get_pos_tags(self, doc: Doc) -> list[dict[str, Any]]:
        """Get part-of-speech tags with detailed information.

        Args:
            doc: spaCy document

        Returns:
            List of POS tag dictionaries
        """
        pos_tags = []
        for token in doc:
            pos_tags.append(
                {
                    "text": token.text,
                    "pos": token.pos_,
                    "tag": token.tag_,
                    "lemma": token.lemma_,
                    "is_stop": token.is_stop,
                    "is_punct": token.is_punct,
                    "is_space": token.is_space,
                }
            )
        return pos_tags

    def _get_dependencies(self, doc: Doc) -> list[dict[str, Any]]:
        """Get dependency parse information.

        Args:
            doc: spaCy document

        Returns:
            List of dependency dictionaries
        """
        dependencies = []
        for token in doc:
            dependencies.append(
                {
                    "text": token.text,
                    "dep": token.dep_,
                    "head": token.head.text,
                    "head_pos": token.head.pos_,
                    "children": [child.text for child in token.children],
                }
            )
        return dependencies

    def _extract_topics(self, text: str) -> list[dict[str, Any]]:
        """Extract topics using LDA.

        Args:
            text: Text to analyze

        Returns:
            List of topic dictionaries
        """
        # Preprocess text; this can yield an empty token list for very short
        # or stop-word-only input, in which case there is nothing to model
        processed_text = preprocess_string(text)
        if not processed_text:
            return []

        # Create or update dictionary
        if self.dictionary is None:
            self.dictionary = corpora.Dictionary([processed_text])
        else:
            self.dictionary.add_documents([processed_text])

        # Create corpus
        corpus = [self.dictionary.doc2bow(processed_text)]

        # Train or update LDA model
        if self.lda_model is None:
            self.lda_model = LdaModel(
                corpus,
                num_topics=self.num_topics,
                passes=self.passes,
                id2word=self.dictionary,
            )
        else:
            self.lda_model.update(corpus)

        # Get topics
        topics = []
        for topic_id, topic in self.lda_model.print_topics():
            # Parse topic terms
            terms = []
            for term in topic.split("+"):
                weight, word = term.strip().split("*")
                terms.append({"term": word.strip('"'), "weight": float(weight)})

            topics.append(
                {
                    "id": topic_id,
                    "terms": terms,
                    "coherence": self._calculate_topic_coherence(terms),
                }
            )

        return topics
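
    # For reference, each string parsed above follows gensim's
    # LdaModel.print_topics() output format of weighted terms, e.g.
    # (term values here are illustrative, not actual model output):
    #
    #   0.100*"vector" + 0.090*"index" + 0.080*"search"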

    def _extract_key_phrases(self, doc: Doc) -> list[str]:
        """Extract key phrases from text.

        Args:
            doc: spaCy document

        Returns:
            List of key phrases
        """
        key_phrases = []

        # Extract noun phrases
        for chunk in doc.noun_chunks:
            if len(chunk.text.split()) >= 2:  # Only multi-word phrases
                key_phrases.append(chunk.text)

        # Extract named entities
        for ent in doc.ents:
            if ent.label_ in ["ORG", "PRODUCT", "WORK_OF_ART", "LAW"]:
                key_phrases.append(ent.text)

        return list(set(key_phrases))  # Remove duplicates

    def _calculate_document_similarity(self, text: str) -> dict[str, float]:
        """Calculate similarity with other processed documents.

        Args:
            text: Text to compare

        Returns:
            Dictionary of document similarities
        """
        similarities = {}
        doc = self.nlp(text)

        for doc_id, cached_result in self._doc_cache.items():
            # Skip cached documents with no extracted entities, since the
            # comparison text is reconstructed from the first entity's
            # sentence context
            if not cached_result.entities:
                continue
            cached_doc = self.nlp(cached_result.entities[0]["context"])
            # Note: small pipelines such as en_core_web_sm ship without word
            # vectors, so similarity scores from them are weak; a model with
            # vectors (e.g. en_core_web_md) gives more meaningful results
            similarity = doc.similarity(cached_doc)
            similarities[doc_id] = float(similarity)

        return similarities

    def _calculate_topic_coherence(self, terms: list[dict[str, Any]]) -> float:
        """Calculate topic coherence score.

        Args:
            terms: List of topic terms with weights

        Returns:
            Coherence score between 0 and 1
        """
        # Simple proxy for coherence based on mean term weight; this is not
        # a formal coherence measure such as UMass or NPMI
        weights = [term["weight"] for term in terms]
        return sum(weights) / len(weights) if weights else 0.0

    def clear_cache(self):
        """Clear the document cache."""
        self._doc_cache.clear()
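

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the module itself): the
# sample text and doc_id below are made up, and the run assumes the
# en_core_web_sm model can be loaded or downloaded.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    analyzer = SemanticAnalyzer()

    result = analyzer.analyze_text(
        "Qdrant Loader ingests documents from Confluence and indexes them "
        "into a Qdrant vector database.",
        doc_id="example-1",
    )

    # Entities carry their label, character offsets, and sentence context
    for entity in result.entities:
        print(entity["text"], entity["label"], entity["description"])

    # Key phrases are deduplicated noun chunks plus selected entity types
    print(result.key_phrases)

    # Topics are lists of weighted terms from the incrementally trained LDA
    for topic in result.topics:
        print(topic["id"], [t["term"] for t in topic["terms"]])

    analyzer.clear_cache()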