Coverage for src/qdrant_loader/connectors/localfile/metadata_extractor.py

1import os

2import re

3from typing import Any

5import chardet

7from qdrant_loader.utils.logging import LoggingConfig

class LocalFileMetadataExtractor:
    """Extract metadata from local files.

    File-level metadata (name, type, directory, encoding, line/word counts,
    size) is produced for every file; Markdown files (``.md``) additionally
    get heading/TOC structure metadata.
    """

    # Heading matcher: 1-6 '#' characters followed by whitespace and text.
    # Compiled once at class level instead of on every call.
    _HEADING_PATTERN = re.compile(
        r"(?:^|\n)\s*(#{1,6})\s+(.+?)(?:\n|$)", re.MULTILINE
    )

    def __init__(self, base_path: str):
        """Store the base path and create the logger.

        Args:
            base_path: Directory that file paths are made relative to.
        """
        self.base_path = base_path
        self.logger = LoggingConfig.get_logger(__name__)

    def extract_all_metadata(self, file_path: str, content: str) -> dict[str, Any]:
        """Return the merged file and structure metadata for one file.

        Args:
            file_path: Path of the file the content came from.
            content: Decoded text content of the file.

        Returns:
            A dict with the file metadata keys, plus ``has_toc``,
            ``heading_levels`` and ``sections_count`` when the path ends
            with ``.md`` (case-insensitive).
        """
        self.logger.debug(f"Starting metadata extraction for file: {file_path!s}")
        file_metadata = self._extract_file_metadata(file_path, content)
        structure_metadata = {}
        if file_path.lower().endswith(".md"):
            structure_metadata = self._extract_structure_metadata(content)
        # Structure keys are merged last, so they win on any key collision.
        metadata = {**file_metadata, **structure_metadata}
        self.logger.debug(f"Completed metadata extraction for {file_path!s}.")
        self.logger.debug(f"Metadata: {metadata!s}")
        return metadata

    def _extract_file_metadata(self, file_path: str, content: str) -> dict[str, Any]:
        """Build path-, encoding- and size-related metadata.

        Args:
            file_path: Path of the file; made relative to ``self.base_path``.
            content: Decoded text content of the file.

        Returns:
            Dict with ``file_type`` (extension including the dot),
            ``file_name``, ``file_directory`` (directory of the relative
            path, rooted at ``/``), ``file_encoding``, ``line_count``,
            ``word_count`` and ``file_size`` (length in bytes of the content
            encoded with the detected encoding).
        """
        rel_path = os.path.relpath(file_path, self.base_path)
        file_encoding = self._detect_encoding(content)
        return {
            "file_type": os.path.splitext(rel_path)[1],
            "file_name": os.path.basename(rel_path),
            "file_directory": os.path.dirname("/" + rel_path),
            "file_encoding": file_encoding,
            "line_count": len(content.splitlines()),
            "word_count": len(content.split()),
            "file_size": self._encoded_size(content, file_encoding),
        }

    @staticmethod
    def _encoded_size(content: str, encoding: str) -> int:
        """Return the byte length of ``content`` encoded with ``encoding``.

        The encoding is a detector's guess, so it may name a codec Python
        does not know (``LookupError``) or one that cannot represent every
        character in the text (``UnicodeError``). In either case the size is
        computed from a UTF-8 encoding instead of aborting extraction.
        """
        try:
            return len(content.encode(encoding))
        except (LookupError, UnicodeError):
            # errors="replace" keeps this fallback from raising on text that
            # even UTF-8 rejects (e.g. lone surrogates).
            return len(content.encode("utf-8", errors="replace"))

    def _extract_structure_metadata(self, content: str) -> dict[str, Any]:
        """Collect heading and table-of-contents information from Markdown.

        Args:
            content: Markdown text.

        Returns:
            Dict with ``has_toc`` (True when the text contains the literal
            ``## Table of Contents`` or ``## Contents``), ``heading_levels``
            (number of ``#`` characters of each matched heading, in document
            order) and ``sections_count`` (number of matched headings).
        """
        headings = self._HEADING_PATTERN.findall(content)
        has_toc = "## Table of Contents" in content or "## Contents" in content
        # Each match is a (hashes, title) tuple; the level is the hash count.
        heading_levels = [len(hashes) for hashes, _title in headings]
        return {
            "has_toc": has_toc,
            "heading_levels": heading_levels,
            "sections_count": len(heading_levels),
        }

    def _detect_encoding(self, content: str) -> str:
        """Guess an encoding name for ``content``, defaulting to ``utf-8``.

        The text is encoded with the default codec and passed to
        ``chardet.detect``. The detected name (lower-cased) is returned only
        when it is non-empty, not ``ascii``, and has confidence above 0.8;
        empty content, any other result, or any detection error yields
        ``"utf-8"``.
        """
        if not content:
            return "utf-8"
        try:
            result = chardet.detect(content.encode())
            if (
                result["encoding"]
                and result["encoding"].lower() != "ascii"
                and result["confidence"] > 0.8
            ):
                return result["encoding"].lower()
        except Exception as e:
            # Detection is best-effort: log and fall through to the default.
            self.logger.error({"event": "Failed to detect encoding", "error": str(e)})
        return "utf-8"

Coverage for src / qdrant_loader / connectors / localfile / metadata_extractor.py: 91%

44 statements