Coverage for src/qdrant_loader/core/text_processing/semantic_analyzer.py: 98%
109 statements
coverage.py v7.8.2, created at 2025-06-04 05:50 +0000
1"""Semantic analysis module for text processing."""
3import logging
4from dataclasses import dataclass
5from typing import Any
7import spacy
8from gensim import corpora
9from gensim.models import LdaModel
10from gensim.parsing.preprocessing import preprocess_string
11from spacy.cli.download import download as spacy_download
12from spacy.tokens import Doc
14logger = logging.getLogger(__name__)
17@dataclass
18class SemanticAnalysisResult:
19 """Container for semantic analysis results."""
21 entities: list[dict[str, Any]]
22 pos_tags: list[dict[str, Any]]
23 dependencies: list[dict[str, Any]]
24 topics: list[dict[str, Any]]
25 key_phrases: list[str]
26 document_similarity: dict[str, float]
29class SemanticAnalyzer:
30 """Advanced semantic analysis for text processing."""
32 def __init__(
33 self,
34 spacy_model: str = "en_core_web_sm",
35 num_topics: int = 5,
36 passes: int = 10,
37 min_topic_freq: int = 2,
38 ):
39 """Initialize the semantic analyzer.
41 Args:
42 spacy_model: Name of the spaCy model to use
43 num_topics: Number of topics for LDA
44 passes: Number of passes for LDA training
45 min_topic_freq: Minimum frequency for topic terms
46 """
47 self.logger = logging.getLogger(__name__)
49 # Initialize spaCy
50 try:
51 self.nlp = spacy.load(spacy_model)
52 except OSError:
53 self.logger.info(f"Downloading spaCy model {spacy_model}...")
54 spacy_download(spacy_model)
55 self.nlp = spacy.load(spacy_model)
57 # Initialize LDA parameters
58 self.num_topics = num_topics
59 self.passes = passes
60 self.min_topic_freq = min_topic_freq
62 # Initialize LDA model
63 self.lda_model = None
64 self.dictionary = None
66 # Cache for processed documents
67 self._doc_cache = {}

    def analyze_text(
        self, text: str, doc_id: str | None = None
    ) -> SemanticAnalysisResult:
        """Perform comprehensive semantic analysis on text.

        Args:
            text: Text to analyze
            doc_id: Optional document ID for caching

        Returns:
            SemanticAnalysisResult containing all analysis results
        """
        # Check cache
        if doc_id and doc_id in self._doc_cache:
            return self._doc_cache[doc_id]

        # Process with spaCy
        doc = self.nlp(text)

        # Extract entities with linking
        entities = self._extract_entities(doc)

        # Get part-of-speech tags
        pos_tags = self._get_pos_tags(doc)

        # Get dependency parse
        dependencies = self._get_dependencies(doc)

        # Extract topics
        topics = self._extract_topics(text)

        # Extract key phrases
        key_phrases = self._extract_key_phrases(doc)

        # Calculate document similarity
        doc_similarity = self._calculate_document_similarity(text)

        # Create result
        result = SemanticAnalysisResult(
            entities=entities,
            pos_tags=pos_tags,
            dependencies=dependencies,
            topics=topics,
            key_phrases=key_phrases,
            document_similarity=doc_similarity,
        )

        # Cache result
        if doc_id:
            self._doc_cache[doc_id] = result

        return result
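
    # Illustrative call (a sketch added for clarity; the text and doc_id below are
    # invented and not taken from the original module or its tests):
    #
    #     analyzer = SemanticAnalyzer()
    #     result = analyzer.analyze_text(
    #         "Apple unveiled a new headset in Cupertino.", doc_id="doc-1"
    #     )
    #     result.entities[0]["text"]    # e.g. "Apple"
    #     result.key_phrases            # e.g. ["a new headset", ...]
    #
    # Passing doc_id caches the result, so a repeated call with the same id
    # returns the cached SemanticAnalysisResult without re-running the pipeline.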

    def _extract_entities(self, doc: Doc) -> list[dict[str, Any]]:
        """Extract named entities with linking.

        Args:
            doc: spaCy document

        Returns:
            List of entity dictionaries with linking information
        """
        entities = []
        for ent in doc.ents:
            # Get entity context (the full sentence containing the entity)
            start_sent = ent.sent.start
            end_sent = ent.sent.end
            context = doc[start_sent:end_sent].text

            # Get a human-readable description of the entity label
            description = spacy.explain(ent.label_) or ent.label_

            # Get related entities from the same sentence
            related = []
            for token in ent.sent:
                if token.ent_type_ and token.text != ent.text:
                    related.append(
                        {
                            "text": token.text,
                            "type": token.ent_type_,
                            "relation": token.dep_,
                        }
                    )

            entities.append(
                {
                    "text": ent.text,
                    "label": ent.label_,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "description": description,
                    "context": context,
                    "related_entities": related,
                }
            )

        return entities
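
    # Shape sketch: for a sentence such as "Apple hired Tim Cook." (an invented
    # example), one returned entry would look roughly like
    #
    #     {
    #         "text": "Apple",
    #         "label": "ORG",
    #         "start": 0,
    #         "end": 5,
    #         "description": "Companies, agencies, institutions, etc.",
    #         "context": "Apple hired Tim Cook.",
    #         "related_entities": [
    #             {"text": "Tim", "type": "PERSON", "relation": "compound"},
    #             {"text": "Cook", "type": "PERSON", "relation": "dobj"},
    #         ],
    #     }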

    def _get_pos_tags(self, doc: Doc) -> list[dict[str, Any]]:
        """Get part-of-speech tags with detailed information.

        Args:
            doc: spaCy document

        Returns:
            List of POS tag dictionaries
        """
        pos_tags = []
        for token in doc:
            pos_tags.append(
                {
                    "text": token.text,
                    "pos": token.pos_,
                    "tag": token.tag_,
                    "lemma": token.lemma_,
                    "is_stop": token.is_stop,
                    "is_punct": token.is_punct,
                    "is_space": token.is_space,
                }
            )
        return pos_tags

    def _get_dependencies(self, doc: Doc) -> list[dict[str, Any]]:
        """Get dependency parse information.

        Args:
            doc: spaCy document

        Returns:
            List of dependency dictionaries
        """
        dependencies = []
        for token in doc:
            dependencies.append(
                {
                    "text": token.text,
                    "dep": token.dep_,
                    "head": token.head.text,
                    "head_pos": token.head.pos_,
                    "children": [child.text for child in token.children],
                }
            )
        return dependencies

    def _extract_topics(self, text: str) -> list[dict[str, Any]]:
        """Extract topics using LDA.

        Args:
            text: Text to analyze

        Returns:
            List of topic dictionaries
        """
        # Preprocess text
        processed_text = preprocess_string(text)

        # Create or update dictionary
        if self.dictionary is None:
            self.dictionary = corpora.Dictionary([processed_text])
        else:
            self.dictionary.add_documents([processed_text])

        # Create corpus
        corpus = [self.dictionary.doc2bow(processed_text)]

        # Train or update LDA model
        if self.lda_model is None:
            self.lda_model = LdaModel(
                corpus,
                num_topics=self.num_topics,
                passes=self.passes,
                id2word=self.dictionary,
            )
        else:
            self.lda_model.update(corpus)

        # Get topics
        topics = []
        for topic_id, topic in self.lda_model.print_topics():
            # Parse topic terms
            terms = []
            for term in topic.split("+"):
                weight, word = term.strip().split("*")
                terms.append({"term": word.strip('"'), "weight": float(weight)})

            topics.append(
                {
                    "id": topic_id,
                    "terms": terms,
                    "coherence": self._calculate_topic_coherence(terms),
                }
            )

        return topics
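
    # Parsing note: LdaModel.print_topics() returns strings shaped like
    # '0.120*"vector" + 0.080*"search" + ...' (the weights and terms here are
    # invented), which is what the weight/word split above relies on.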

    def _extract_key_phrases(self, doc: Doc) -> list[str]:
        """Extract key phrases from text.

        Args:
            doc: spaCy document

        Returns:
            List of key phrases
        """
        key_phrases = []

        # Extract noun phrases
        for chunk in doc.noun_chunks:
            if len(chunk.text.split()) >= 2:  # Only multi-word phrases
                key_phrases.append(chunk.text)

        # Extract named entities
        for ent in doc.ents:
            if ent.label_ in ["ORG", "PRODUCT", "WORK_OF_ART", "LAW"]:
                key_phrases.append(ent.text)

        return list(set(key_phrases))  # Remove duplicates

    def _calculate_document_similarity(self, text: str) -> dict[str, float]:
        """Calculate similarity with other processed documents.

        Args:
            text: Text to compare

        Returns:
            Dictionary of document similarities
        """
        similarities = {}
        doc = self.nlp(text)

        for doc_id, cached_result in self._doc_cache.items():
            # Skip cached documents that yielded no entities; there is no stored
            # context text to compare against in that case.
            if not cached_result.entities:
                continue
            cached_doc = self.nlp(cached_result.entities[0]["context"])
            similarity = doc.similarity(cached_doc)
            similarities[doc_id] = float(similarity)

        return similarities
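
    # Note: with a small pipeline such as en_core_web_sm there are no static word
    # vectors, so Doc.similarity() falls back to the tagger/parser tensors and
    # spaCy emits a warning; treat these scores as rough signals rather than
    # precise similarities.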

    def _calculate_topic_coherence(self, terms: list[dict[str, Any]]) -> float:
        """Calculate topic coherence score.

        Args:
            terms: List of topic terms with weights

        Returns:
            Coherence score between 0 and 1
        """
        # Simple coherence based on term weights
        weights = [term["weight"] for term in terms]
        return sum(weights) / len(weights) if weights else 0.0
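
    # Worked example (invented numbers): weights [0.12, 0.08, 0.04] give a
    # coherence of (0.12 + 0.08 + 0.04) / 3 = 0.08; because the weights are LDA
    # term probabilities, the mean stays within [0, 1].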

    def clear_cache(self):
        """Clear the document cache."""
        self._doc_cache.clear()
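

# A minimal usage sketch, added for illustration only (not part of the original
# module): the sample text and doc_id are invented, and the first run may
# trigger a spaCy model download.
if __name__ == "__main__":
    analyzer = SemanticAnalyzer(num_topics=2, passes=5)
    result = analyzer.analyze_text(
        "Qdrant is an open-source vector database used for semantic search.",
        doc_id="example-doc",
    )
    print([e["text"] for e in result.entities])
    print(result.key_phrases)
    print([t["id"] for t in result.topics])
    analyzer.clear_cache()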