Coverage for src/qdrant_loader/core/chunking/strategy/base/document_parser.py

"""Base class for document structure analysis."""

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Placeholder for imports needed only by static type checkers;
    # none are required at present.
    pass

class BaseDocumentParser(ABC):
    """Base class for document structure analysis.

    This class defines the interface for parsing document structure and
    extracting metadata from different document types (text, HTML, code,
    JSON, etc.). Each strategy should implement its own document parser
    based on the specific structure and characteristics of the document
    type.

    Subclasses must override ``parse_document_structure`` and
    ``extract_section_metadata``. ``extract_section_title`` and
    ``analyze_content_characteristics`` are concrete, format-agnostic
    defaults that may be overridden or extended.
    """

    @abstractmethod
    def parse_document_structure(self, content: str) -> dict[str, Any]:
        """Parse document structure and extract structural information.

        Implementations should analyze the document content and return a
        dictionary containing structural information such as:

        - Document type and format
        - Hierarchical structure (sections, headings, etc.)
        - Content statistics (word count, section count, etc.)
        - Structural metadata specific to the document type

        Args:
            content: The document content to analyze.

        Returns:
            Dictionary containing structural information about the document.

        Raises:
            NotImplementedError: If a subclass delegates to this base
                implementation (e.g. through ``super()``) instead of
                supplying its own logic.
        """
        raise NotImplementedError(
            "Document parser must implement parse_document_structure method"
        )

    @abstractmethod
    def extract_section_metadata(self, section: Any) -> dict[str, Any]:
        """Extract metadata from a document section.

        Implementations should extract relevant metadata from a section of
        the document, such as:

        - Section type and level
        - Content characteristics
        - Structural relationships
        - Section-specific metadata

        Args:
            section: The section object to extract metadata from.

        Returns:
            Dictionary containing section metadata.

        Raises:
            NotImplementedError: If a subclass delegates to this base
                implementation (e.g. through ``super()``) instead of
                supplying its own logic.
        """
        raise NotImplementedError(
            "Document parser must implement extract_section_metadata method"
        )

    def extract_section_title(self, content: str) -> str:
        """Extract section title from content.

        This default implementation uses the first line that contains
        non-whitespace characters. Specific parsers can override it to
        provide better title extraction based on document type.

        Args:
            content: The section content.

        Returns:
            The first non-blank line, stripped of surrounding whitespace and
            truncated to 100 characters, or an empty string if every line is
            blank.
        """
        for raw_line in content.split("\n"):
            title = raw_line.strip()
            if title:
                # Limit title length so long paragraphs do not become titles.
                return title[:100]
        return ""

    def analyze_content_characteristics(self, content: str) -> dict[str, Any]:
        """Analyze general content characteristics.

        This method provides basic content analysis that can be used by all
        parsers and extended by specific implementations.

        Args:
            content: The content to analyze.

        Returns:
            Dictionary with the keys ``line_count``, ``word_count``,
            ``character_count``, ``non_empty_line_count``,
            ``avg_line_length``, ``avg_word_length`` and ``has_unicode``.
            Lines are the pieces between ``"\\n"`` separators, so empty
            content counts as a single empty line. ``has_unicode`` is True
            when any character lies outside the ASCII range.
        """
        # str.split("\n") always yields at least one element, so dividing by
        # len(lines) below can never raise ZeroDivisionError.
        lines = content.split("\n")
        words = content.split()

        return {
            "line_count": len(lines),
            "word_count": len(words),
            "character_count": len(content),
            "non_empty_line_count": sum(1 for line in lines if line.strip()),
            "avg_line_length": sum(map(len, lines)) / len(lines),
            # Whitespace-only content has no words; report 0 rather than
            # dividing by zero.
            "avg_word_length": (
                sum(map(len, words)) / len(words) if words else 0
            ),
            # Equivalent to checking for any code point above 127.
            "has_unicode": not content.isascii(),
        }

Coverage for src/qdrant_loader/core/chunking/strategy/base/document_parser.py: 90%

20 statements