Coverage for src/qdrant_loader/core/chunking/strategy/base/document_parser.py: 90%

20 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-13 09:19 +0000

1"""Base class for document structure analysis.""" 

2 

3from abc import ABC, abstractmethod 

4from typing import TYPE_CHECKING, Any 

5 

6if TYPE_CHECKING: 

7 pass 

8 

9 

class BaseDocumentParser(ABC):
    """Abstract interface for analysing the structure of a document.

    Concrete chunking strategies (plain text, HTML, source code, JSON, ...)
    subclass this and supply the two abstract hooks,
    ``parse_document_structure`` and ``extract_section_metadata``. The
    remaining methods are format-agnostic helpers with working default
    implementations that subclasses may override.
    """

    @abstractmethod
    def parse_document_structure(self, content: str) -> dict[str, Any]:
        """Analyse a whole document and describe its structure.

        Implementations are expected to report things such as:
        - the document type and format
        - the hierarchy of sections / headings
        - content statistics (word count, section count, ...)
        - any structural metadata specific to the document type

        Args:
            content: Full text of the document to analyse.

        Returns:
            Dictionary describing the structure of the document.

        Raises:
            NotImplementedError: When a subclass delegates to this base
                implementation instead of providing its own.
        """
        raise NotImplementedError(
            "Document parser must implement parse_document_structure method"
        )

    @abstractmethod
    def extract_section_metadata(self, section: Any) -> dict[str, Any]:
        """Collect metadata describing a single section of a document.

        Implementations are expected to report things such as:
        - the section type and nesting level
        - characteristics of the section content
        - relationships to other sections
        - any metadata specific to this kind of section

        Args:
            section: Section object to inspect; its concrete type is chosen
                by the implementing parser.

        Returns:
            Dictionary holding the metadata of the section.

        Raises:
            NotImplementedError: When a subclass delegates to this base
                implementation instead of providing its own.
        """
        raise NotImplementedError(
            "Document parser must implement extract_section_metadata method"
        )

    def extract_section_title(self, content: str) -> str:
        """Derive a title for a section from its text.

        Default behaviour: take the first line that is not blank, trim the
        surrounding whitespace and cap it at 100 characters. Parsers for
        specific formats can override this with smarter extraction.

        Args:
            content: Text of the section.

        Returns:
            The derived title, or an empty string when the content holds no
            non-blank line.
        """
        trimmed_lines = (
            raw_line.strip() for raw_line in content.strip().split("\n")
        )
        # First line with visible text wins; "" is the fallback and slicing
        # it is a no-op, so the length cap can be applied unconditionally.
        first_with_text = next(
            (candidate for candidate in trimmed_lines if candidate), ""
        )
        return first_with_text[:100]

    def analyze_content_characteristics(self, content: str) -> dict[str, Any]:
        """Compute simple, format-independent statistics for some content.

        Lines are delimited by ``"\\n"`` only, so an empty string counts as
        one (empty) line and a trailing newline adds a final empty line.
        Words are whitespace-separated tokens.

        Args:
            content: Text to measure.

        Returns:
            Dictionary with the keys ``line_count``, ``word_count``,
            ``character_count``, ``non_empty_line_count``,
            ``avg_line_length``, ``avg_word_length`` and ``has_unicode``
            (``True`` when any character lies outside the ASCII range).
        """
        all_lines = content.split("\n")
        tokens = content.split()
        line_total = len(all_lines)
        word_total = len(tokens)

        # Averages fall back to the integer 0 when there is nothing to
        # average over.
        mean_line_length = 0
        if line_total:
            mean_line_length = sum(map(len, all_lines)) / line_total

        mean_word_length = 0
        if word_total:
            mean_word_length = sum(map(len, tokens)) / word_total

        stats: dict[str, Any] = {}
        stats["line_count"] = line_total
        stats["word_count"] = word_total
        stats["character_count"] = len(content)
        stats["non_empty_line_count"] = sum(
            1 for text_line in all_lines if text_line.strip()
        )
        stats["avg_line_length"] = mean_line_length
        stats["avg_word_length"] = mean_word_length
        # A string is ASCII exactly when no code point exceeds 127.
        stats["has_unicode"] = not content.isascii()
        return stats