Coverage for src/qdrant_loader/connectors/localfile/metadata_extractor.py: 91%

44 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-04 05:50 +0000

1import os 

2import re 

3from typing import Any 

4 

5import chardet 

6 

7from qdrant_loader.utils.logging import LoggingConfig 

8 

9 

class LocalFileMetadataExtractor:
    """Extract metadata from local files.

    Builds a flat dictionary describing a file (name, extension, directory
    relative to ``base_path``, encoding, size, line and word counts) and, for
    files whose path ends in ``.md``, a few facts about its Markdown headings.
    """

    # Encoding reported whenever detection yields nothing usable.
    _DEFAULT_ENCODING = "utf-8"

    # One to six '#' characters at the start of a line (optionally preceded by
    # whitespace), followed by whitespace and the heading text.
    _HEADING_PATTERN = re.compile(
        r"(?:^|\n)\s*(#{1,6})\s+(.+?)(?:\n|$)", re.MULTILINE
    )

    def __init__(self, base_path: str):
        """Store the base path and obtain a logger.

        Args:
            base_path: Path that file paths are made relative to when
                computing the name, extension and directory metadata.
        """
        self.base_path = base_path
        self.logger = LoggingConfig.get_logger(__name__)

    def extract_all_metadata(self, file_path: str, content: str) -> dict[str, Any]:
        """Return the combined file and structure metadata for one file.

        Args:
            file_path: Path of the file the content came from.
            content: The file's text content.

        Returns:
            The file metadata, merged with the structure metadata when
            ``file_path`` ends in ``.md`` (case-insensitive). On a key
            collision the structure metadata wins.
        """
        self.logger.debug(f"Starting metadata extraction for file: {file_path!s}")
        file_metadata = self._extract_file_metadata(file_path, content)
        structure_metadata = {}
        if file_path.lower().endswith(".md"):
            structure_metadata = self._extract_structure_metadata(content)
        metadata = {**file_metadata, **structure_metadata}
        self.logger.debug(f"Completed metadata extraction for {file_path!s}.")
        self.logger.debug(f"Metadata: {metadata!s}")
        return metadata

    def _extract_file_metadata(self, file_path: str, content: str) -> dict[str, Any]:
        """Return path-, size- and count-based metadata for one file.

        Args:
            file_path: Path of the file; made relative to ``self.base_path``.
            content: The file's text content.

        Returns:
            A dict with the keys ``file_type`` (extension including the dot),
            ``file_name``, ``file_directory`` (directory of the relative path,
            prefixed with ``/``), ``file_encoding``, ``line_count``,
            ``word_count`` (whitespace-separated tokens) and ``file_size``
            (length in bytes of ``content`` encoded with ``file_encoding``).

        Raises:
            UnicodeEncodeError: If ``content`` cannot be encoded as UTF-8
                either (for example, it contains lone surrogates).
        """
        rel_path = os.path.relpath(file_path, self.base_path)
        file_encoding = self._detect_encoding(content)

        try:
            file_size = len(content.encode(file_encoding))
        except (UnicodeEncodeError, LookupError):
            # The detected codec is only a guess: it may be unknown to Python
            # or unable to represent every character of the text. Fall back to
            # UTF-8 so one bad guess does not abort the whole extraction, and
            # report the encoding that was actually used for the size.
            self.logger.debug(
                f"Detected encoding {file_encoding!s} cannot encode the content; "
                f"falling back to {self._DEFAULT_ENCODING}"
            )
            file_encoding = self._DEFAULT_ENCODING
            file_size = len(content.encode(file_encoding))

        return {
            "file_type": os.path.splitext(rel_path)[1],
            "file_name": os.path.basename(rel_path),
            "file_directory": os.path.dirname("/" + rel_path),
            "file_encoding": file_encoding,
            "line_count": len(content.splitlines()),
            "word_count": len(content.split()),
            "file_size": file_size,
        }

    def _extract_structure_metadata(self, content: str) -> dict[str, Any]:
        """Return heading-based structure metadata for Markdown text.

        Args:
            content: The Markdown text.

        Returns:
            A dict with ``has_toc`` (whether the text contains the literal
            ``## Table of Contents`` or ``## Contents``), ``heading_levels``
            (number of ``#`` characters of each matched heading, in order)
            and ``sections_count`` (number of matched headings).
        """
        headings = self._HEADING_PATTERN.findall(content)
        has_toc = "## Table of Contents" in content or "## Contents" in content
        heading_levels = [len(hashes) for hashes, _title in headings]
        return {
            "has_toc": has_toc,
            "heading_levels": heading_levels,
            "sections_count": len(heading_levels),
        }

    def _detect_encoding(self, content: str) -> str:
        """Guess an encoding name for the text, defaulting to UTF-8.

        Detection runs on ``content.encode()``, i.e. on the UTF-8 bytes of the
        already-decoded text, so the result is a guess about that re-encoding
        rather than about the bytes originally read from disk.

        Args:
            content: The text to inspect.

        Returns:
            The lower-cased detected encoding when one is reported, it is not
            ASCII, and the confidence is above 0.8; otherwise ``"utf-8"``.
            Empty content and any detection failure also yield ``"utf-8"``.
        """
        if not content:
            return self._DEFAULT_ENCODING
        try:
            result = chardet.detect(content.encode())
            detected = result["encoding"]
            if detected and detected.lower() != "ascii" and result["confidence"] > 0.8:
                return detected.lower()
        except Exception as e:
            # Deliberately best-effort: a detection failure is logged and the
            # default encoding is used instead of failing the extraction.
            self.logger.error({"event": "Failed to detect encoding", "error": str(e)})
        return self._DEFAULT_ENCODING

72 return "utf-8"