Coverage for src/qdrant_loader/core/chunking/strategy/base/document_parser.py: 90%
20 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-13 09:19 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-13 09:19 +0000
1"""Base class for document structure analysis."""
3from abc import ABC, abstractmethod
4from typing import TYPE_CHECKING, Any
6if TYPE_CHECKING:
7 pass
class BaseDocumentParser(ABC):
    """Base class for document structure analysis.

    This class defines the interface for parsing document structure and
    extracting metadata from different document types (text, HTML, code,
    JSON, etc.). Each strategy should implement its own document parser based
    on the specific structure and characteristics of the document type.

    Subclasses must override ``parse_document_structure`` and
    ``extract_section_metadata``; the remaining methods are generic defaults
    that operate on plain strings and may be overridden.
    """

    @abstractmethod
    def parse_document_structure(self, content: str) -> dict[str, Any]:
        """Parse document structure and extract structural information.

        Implementations should analyze the document content and return a
        dictionary containing structural information such as:
        - Document type and format
        - Hierarchical structure (sections, headings, etc.)
        - Content statistics (word count, section count, etc.)
        - Structural metadata specific to the document type

        Args:
            content: The document content to analyze

        Returns:
            Dictionary containing structural information about the document

        Raises:
            NotImplementedError: If the parser doesn't implement this method
        """
        raise NotImplementedError(
            "Document parser must implement parse_document_structure method"
        )

    @abstractmethod
    def extract_section_metadata(self, section: Any) -> dict[str, Any]:
        """Extract metadata from a document section.

        Implementations should extract relevant metadata from a section of
        the document, such as:
        - Section type and level
        - Content characteristics
        - Structural relationships
        - Section-specific metadata

        Args:
            section: The section object to extract metadata from

        Returns:
            Dictionary containing section metadata

        Raises:
            NotImplementedError: If the parser doesn't implement this method
        """
        raise NotImplementedError(
            "Document parser must implement extract_section_metadata method"
        )

    def extract_section_title(self, content: str) -> str:
        """Extract section title from content.

        Default implementation: the title is the first line that is not
        blank after stripping surrounding whitespace, truncated to 100
        characters. Specific parsers can override this to provide better
        title extraction based on document type.

        Args:
            content: The section content

        Returns:
            Extracted section title or empty string if none found
        """
        # Lines are split on "\n" only; each candidate is stripped, so
        # leading blank lines and stray "\r" characters are ignored.
        for raw_line in content.split("\n"):
            title = raw_line.strip()
            if title:
                return title[:100]  # Limit title length
        return ""

    def analyze_content_characteristics(self, content: str) -> dict[str, Any]:
        """Analyze general content characteristics.

        Provides basic content analysis that can be used by all parsers and
        extended by specific implementations.

        Args:
            content: The content to analyze

        Returns:
            Dictionary with the keys ``line_count``, ``word_count``,
            ``character_count``, ``non_empty_line_count``,
            ``avg_line_length``, ``avg_word_length`` and ``has_unicode``.
            Lines are separated by "\\n" (empty content counts as one empty
            line); words are whitespace-separated tokens. ``avg_word_length``
            is ``0`` when there are no words. ``has_unicode`` is True when
            any character lies outside the ASCII range.
        """
        # str.split("\n") always yields at least one element, so dividing by
        # len(lines) below can never be a division by zero.
        lines = content.split("\n")
        words = content.split()

        return {
            "line_count": len(lines),
            "word_count": len(words),
            "character_count": len(content),
            "non_empty_line_count": sum(1 for line in lines if line.strip()),
            "avg_line_length": sum(len(line) for line in lines) / len(lines),
            "avg_word_length": (
                sum(len(word) for word in words) / len(words) if words else 0
            ),
            # isascii() is True for empty strings and for code points <= 127.
            "has_unicode": not content.isascii(),
        }